001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.util.ExitUtil.terminate;
021
022import java.io.IOException;
023import java.util.ArrayList;
024import java.util.Collection;
025import java.util.Collections;
026import java.util.Comparator;
027import java.util.LinkedList;
028import java.util.List;
029import java.util.PriorityQueue;
030import java.util.SortedSet;
031import java.util.concurrent.CopyOnWriteArrayList;
032
033import org.apache.commons.logging.Log;
034import org.apache.commons.logging.LogFactory;
035import org.apache.hadoop.classification.InterfaceAudience;
036import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
037import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
038import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
039
040import static org.apache.hadoop.util.ExitUtil.terminate;
041
042import com.google.common.base.Preconditions;
043import com.google.common.collect.ComparisonChain;
044import com.google.common.collect.ImmutableList;
045import com.google.common.collect.ImmutableListMultimap;
046import com.google.common.collect.Lists;
047import com.google.common.collect.Multimaps;
048import com.google.common.collect.Sets;
049
050/**
 * Manages a collection of Journals. With the exception of
 * getEditLogManifest, none of the methods are synchronized; it is assumed
 * that the FSEditLog methods that use this class provide proper
 * synchronization.
054 */
055public class JournalSet implements JournalManager {
056
057  static final Log LOG = LogFactory.getLog(FSEditLog.class);
058  
059  static final public Comparator<EditLogInputStream>
060    EDIT_LOG_INPUT_STREAM_COMPARATOR = new Comparator<EditLogInputStream>() {
061      @Override
062      public int compare(EditLogInputStream a, EditLogInputStream b) {
063        return ComparisonChain.start().
064          compare(a.getFirstTxId(), b.getFirstTxId()).
065          compare(b.getLastTxId(), a.getLastTxId()).
066          result();
067      }
068    };
069  
070  /**
071   * Container for a JournalManager paired with its currently
072   * active stream.
073   * 
074   * If a Journal gets disabled due to an error writing to its
075   * stream, then the stream will be aborted and set to null.
076   */
077  static class JournalAndStream implements CheckableNameNodeResource {
078    private final JournalManager journal;
079    private boolean disabled = false;
080    private EditLogOutputStream stream;
081    private boolean required = false;
082    
083    public JournalAndStream(JournalManager manager, boolean required) {
084      this.journal = manager;
085      this.required = required;
086    }
087
088    public void startLogSegment(long txId) throws IOException {
089      Preconditions.checkState(stream == null);
090      disabled = false;
091      stream = journal.startLogSegment(txId);
092    }
093
094    /**
095     * Closes the stream, also sets it to null.
096     */
097    public void closeStream() throws IOException {
098      if (stream == null) return;
099      stream.close();
100      stream = null;
101    }
102
103    /**
104     * Close the Journal and Stream
105     */
106    public void close() throws IOException {
107      closeStream();
108
109      journal.close();
110    }
111    
112    /**
113     * Aborts the stream, also sets it to null.
114     */
115    public void abort() {
116      if (stream == null) return;
117      try {
118        stream.abort();
119      } catch (IOException ioe) {
120        LOG.error("Unable to abort stream " + stream, ioe);
121      }
122      stream = null;
123    }
124
125    boolean isActive() {
126      return stream != null;
127    }
128    
129    /**
130     * Should be used outside JournalSet only for testing.
131     */
132    EditLogOutputStream getCurrentStream() {
133      return stream;
134    }
135    
136    @Override
137    public String toString() {
138      return "JournalAndStream(mgr=" + journal +
139        ", " + "stream=" + stream + ")";
140    }
141
142    void setCurrentStreamForTests(EditLogOutputStream stream) {
143      this.stream = stream;
144    }
145    
146    JournalManager getManager() {
147      return journal;
148    }
149
150    boolean isDisabled() {
151      return disabled;
152    }
153
154    private void setDisabled(boolean disabled) {
155      this.disabled = disabled;
156    }
157    
158    @Override
159    public boolean isResourceAvailable() {
160      return !isDisabled();
161    }
162    
163    @Override
164    public boolean isRequired() {
165      return required;
166    }
167  }
168 
169  // COW implementation is necessary since some users (eg the web ui) call
170  // getAllJournalStreams() and then iterate. Since this is rarely
171  // mutated, there is no performance concern.
172  private List<JournalAndStream> journals =
173      new CopyOnWriteArrayList<JournalSet.JournalAndStream>();
174  final int minimumRedundantJournals;
175  
176  JournalSet(int minimumRedundantResources) {
177    this.minimumRedundantJournals = minimumRedundantResources;
178  }
179  
180  @Override
181  public void format(NamespaceInfo nsInfo) throws IOException {
182    // The iteration is done by FSEditLog itself
183    throw new UnsupportedOperationException();
184  }
185
186  @Override
187  public boolean hasSomeData() throws IOException {
188    // This is called individually on the underlying journals,
189    // not on the JournalSet.
190    throw new UnsupportedOperationException();
191  }
192
193  
194  @Override
195  public EditLogOutputStream startLogSegment(final long txId) throws IOException {
196    mapJournalsAndReportErrors(new JournalClosure() {
197      @Override
198      public void apply(JournalAndStream jas) throws IOException {
199        jas.startLogSegment(txId);
200      }
201    }, "starting log segment " + txId);
202    return new JournalSetOutputStream();
203  }
204  
205  @Override
206  public void finalizeLogSegment(final long firstTxId, final long lastTxId)
207      throws IOException {
208    mapJournalsAndReportErrors(new JournalClosure() {
209      @Override
210      public void apply(JournalAndStream jas) throws IOException {
211        if (jas.isActive()) {
212          jas.closeStream();
213          jas.getManager().finalizeLogSegment(firstTxId, lastTxId);
214        }
215      }
216    }, "finalize log segment " + firstTxId + ", " + lastTxId);
217  }
218   
219  @Override
220  public void close() throws IOException {
221    mapJournalsAndReportErrors(new JournalClosure() {
222      @Override
223      public void apply(JournalAndStream jas) throws IOException {
224        jas.close();
225      }
226    }, "close journal");
227  }
228
229  /**
230   * In this function, we get a bunch of streams from all of our JournalManager
231   * objects.  Then we add these to the collection one by one.
232   * 
233   * @param streams          The collection to add the streams to.  It may or 
234   *                         may not be sorted-- this is up to the caller.
235   * @param fromTxId         The transaction ID to start looking for streams at
236   * @param inProgressOk     Should we consider unfinalized streams?
237   */
238  @Override
239  public void selectInputStreams(Collection<EditLogInputStream> streams,
240      long fromTxId, boolean inProgressOk) throws IOException {
241    final PriorityQueue<EditLogInputStream> allStreams = 
242        new PriorityQueue<EditLogInputStream>(64,
243            EDIT_LOG_INPUT_STREAM_COMPARATOR);
244    for (JournalAndStream jas : journals) {
245      if (jas.isDisabled()) {
246        LOG.info("Skipping jas " + jas + " since it's disabled");
247        continue;
248      }
249      try {
250        jas.getManager().selectInputStreams(allStreams, fromTxId, inProgressOk);
251      } catch (IOException ioe) {
252        LOG.warn("Unable to determine input streams from " + jas.getManager() +
253            ". Skipping.", ioe);
254      }
255    }
256    chainAndMakeRedundantStreams(streams, allStreams, fromTxId);
257  }
258  
259  public static void chainAndMakeRedundantStreams(
260      Collection<EditLogInputStream> outStreams,
261      PriorityQueue<EditLogInputStream> allStreams, long fromTxId) {
262    // We want to group together all the streams that start on the same start
263    // transaction ID.  To do this, we maintain an accumulator (acc) of all
264    // the streams we've seen at a given start transaction ID.  When we see a
265    // higher start transaction ID, we select a stream from the accumulator and
266    // clear it.  Then we begin accumulating streams with the new, higher start
267    // transaction ID.
268    LinkedList<EditLogInputStream> acc =
269        new LinkedList<EditLogInputStream>();
270    EditLogInputStream elis;
271    while ((elis = allStreams.poll()) != null) {
272      if (acc.isEmpty()) {
273        acc.add(elis);
274      } else {
275        long accFirstTxId = acc.get(0).getFirstTxId();
276        if (accFirstTxId == elis.getFirstTxId()) {
277          acc.add(elis);
278        } else if (accFirstTxId < elis.getFirstTxId()) {
279          outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
280          acc.clear();
281          acc.add(elis);
282        } else if (accFirstTxId > elis.getFirstTxId()) {
283          throw new RuntimeException("sorted set invariants violated!  " +
284              "Got stream with first txid " + elis.getFirstTxId() +
285              ", but the last firstTxId was " + accFirstTxId);
286        }
287      }
288    }
289    if (!acc.isEmpty()) {
290      outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
291      acc.clear();
292    }
293  }
294
295  /**
296   * Returns true if there are no journals, all redundant journals are disabled,
297   * or any required journals are disabled.
298   * 
299   * @return True if there no journals, all redundant journals are disabled,
300   * or any required journals are disabled.
301   */
302  public boolean isEmpty() {
303    return !NameNodeResourcePolicy.areResourcesAvailable(journals,
304        minimumRedundantJournals);
305  }
306  
307  /**
308   * Called when some journals experience an error in some operation.
309   */
310  private void disableAndReportErrorOnJournals(List<JournalAndStream> badJournals) {
311    if (badJournals == null || badJournals.isEmpty()) {
312      return; // nothing to do
313    }
314 
315    for (JournalAndStream j : badJournals) {
316      LOG.error("Disabling journal " + j);
317      j.abort();
318      j.setDisabled(true);
319    }
320  }
321
322  /**
323   * Implementations of this interface encapsulate operations that can be
324   * iteratively applied on all the journals. For example see
325   * {@link JournalSet#mapJournalsAndReportErrors}.
326   */
327  private interface JournalClosure {
328    /**
329     * The operation on JournalAndStream.
330     * @param jas Object on which operations are performed.
331     * @throws IOException
332     */
333    public void apply(JournalAndStream jas) throws IOException;
334  }
335  
336  /**
337   * Apply the given operation across all of the journal managers, disabling
338   * any for which the closure throws an IOException.
339   * @param closure {@link JournalClosure} object encapsulating the operation.
340   * @param status message used for logging errors (e.g. "opening journal")
341   * @throws IOException If the operation fails on all the journals.
342   */
343  private void mapJournalsAndReportErrors(
344      JournalClosure closure, String status) throws IOException{
345
346    List<JournalAndStream> badJAS = Lists.newLinkedList();
347    for (JournalAndStream jas : journals) {
348      try {
349        closure.apply(jas);
350      } catch (Throwable t) {
351        if (jas.isRequired()) {
352          final String msg = "Error: " + status + " failed for required journal ("
353            + jas + ")";
354          LOG.fatal(msg, t);
355          // If we fail on *any* of the required journals, then we must not
356          // continue on any of the other journals. Abort them to ensure that
357          // retry behavior doesn't allow them to keep going in any way.
358          abortAllJournals();
359          // the current policy is to shutdown the NN on errors to shared edits
360          // dir. There are many code paths to shared edits failures - syncs,
361          // roll of edits etc. All of them go through this common function 
362          // where the isRequired() check is made. Applying exit policy here 
363          // to catch all code paths.
364          terminate(1, msg);
365        } else {
366          LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
367          badJAS.add(jas);          
368        }
369      }
370    }
371    disableAndReportErrorOnJournals(badJAS);
372    if (!NameNodeResourcePolicy.areResourcesAvailable(journals,
373        minimumRedundantJournals)) {
374      String message = status + " failed for too many journals";
375      LOG.error("Error: " + message);
376      throw new IOException(message);
377    }
378  }
379  
380  /**
381   * Abort all of the underlying streams.
382   */
383  private void abortAllJournals() {
384    for (JournalAndStream jas : journals) {
385      if (jas.isActive()) {
386        jas.abort();
387      }
388    }
389  }
390
391  /**
392   * An implementation of EditLogOutputStream that applies a requested method on
393   * all the journals that are currently active.
394   */
395  private class JournalSetOutputStream extends EditLogOutputStream {
396
397    JournalSetOutputStream() throws IOException {
398      super();
399    }
400
401    @Override
402    public void write(final FSEditLogOp op)
403        throws IOException {
404      mapJournalsAndReportErrors(new JournalClosure() {
405        @Override
406        public void apply(JournalAndStream jas) throws IOException {
407          if (jas.isActive()) {
408            jas.getCurrentStream().write(op);
409          }
410        }
411      }, "write op");
412    }
413
414    @Override
415    public void writeRaw(final byte[] data, final int offset, final int length)
416        throws IOException {
417      mapJournalsAndReportErrors(new JournalClosure() {
418        @Override
419        public void apply(JournalAndStream jas) throws IOException {
420          if (jas.isActive()) {
421            jas.getCurrentStream().writeRaw(data, offset, length);
422          }
423        }
424      }, "write bytes");
425    }
426
427    @Override
428    public void create() throws IOException {
429      mapJournalsAndReportErrors(new JournalClosure() {
430        @Override
431        public void apply(JournalAndStream jas) throws IOException {
432          if (jas.isActive()) {
433            jas.getCurrentStream().create();
434          }
435        }
436      }, "create");
437    }
438
439    @Override
440    public void close() throws IOException {
441      mapJournalsAndReportErrors(new JournalClosure() {
442        @Override
443        public void apply(JournalAndStream jas) throws IOException {
444          jas.closeStream();
445        }
446      }, "close");
447    }
448
449    @Override
450    public void abort() throws IOException {
451      mapJournalsAndReportErrors(new JournalClosure() {
452        @Override
453        public void apply(JournalAndStream jas) throws IOException {
454          jas.abort();
455        }
456      }, "abort");
457    }
458
459    @Override
460    public void setReadyToFlush() throws IOException {
461      mapJournalsAndReportErrors(new JournalClosure() {
462        @Override
463        public void apply(JournalAndStream jas) throws IOException {
464          if (jas.isActive()) {
465            jas.getCurrentStream().setReadyToFlush();
466          }
467        }
468      }, "setReadyToFlush");
469    }
470
471    @Override
472    protected void flushAndSync(final boolean durable) throws IOException {
473      mapJournalsAndReportErrors(new JournalClosure() {
474        @Override
475        public void apply(JournalAndStream jas) throws IOException {
476          if (jas.isActive()) {
477            jas.getCurrentStream().flushAndSync(durable);
478          }
479        }
480      }, "flushAndSync");
481    }
482    
483    @Override
484    public void flush() throws IOException {
485      mapJournalsAndReportErrors(new JournalClosure() {
486        @Override
487        public void apply(JournalAndStream jas) throws IOException {
488          if (jas.isActive()) {
489            jas.getCurrentStream().flush();
490          }
491        }
492      }, "flush");
493    }
494    
495    @Override
496    public boolean shouldForceSync() {
497      for (JournalAndStream js : journals) {
498        if (js.isActive() && js.getCurrentStream().shouldForceSync()) {
499          return true;
500        }
501      }
502      return false;
503    }
504    
505    @Override
506    protected long getNumSync() {
507      for (JournalAndStream jas : journals) {
508        if (jas.isActive()) {
509          return jas.getCurrentStream().getNumSync();
510        }
511      }
512      return 0;
513    }
514  }
515
516  @Override
517  public void setOutputBufferCapacity(final int size) {
518    try {
519      mapJournalsAndReportErrors(new JournalClosure() {
520        @Override
521        public void apply(JournalAndStream jas) throws IOException {
522            jas.getManager().setOutputBufferCapacity(size);
523        }
524      }, "setOutputBufferCapacity");
525    } catch (IOException e) {
526      LOG.error("Error in setting outputbuffer capacity");
527    }
528  }
529  
530  List<JournalAndStream> getAllJournalStreams() {
531    return journals;
532  }
533
534  List<JournalManager> getJournalManagers() {
535    List<JournalManager> jList = new ArrayList<JournalManager>();
536    for (JournalAndStream j : journals) {
537      jList.add(j.getManager());
538    }
539    return jList;
540  }
541
542  void add(JournalManager j, boolean required) {
543    JournalAndStream jas = new JournalAndStream(j, required);
544    journals.add(jas);
545  }
546  
547  void remove(JournalManager j) {
548    JournalAndStream jasToRemove = null;
549    for (JournalAndStream jas: journals) {
550      if (jas.getManager().equals(j)) {
551        jasToRemove = jas;
552        break;
553      }
554    }
555    if (jasToRemove != null) {
556      jasToRemove.abort();
557      journals.remove(jasToRemove);
558    }
559  }
560
561  @Override
562  public void purgeLogsOlderThan(final long minTxIdToKeep) throws IOException {
563    mapJournalsAndReportErrors(new JournalClosure() {
564      @Override
565      public void apply(JournalAndStream jas) throws IOException {
566        jas.getManager().purgeLogsOlderThan(minTxIdToKeep);
567      }
568    }, "purgeLogsOlderThan " + minTxIdToKeep);
569  }
570
571  @Override
572  public void recoverUnfinalizedSegments() throws IOException {
573    mapJournalsAndReportErrors(new JournalClosure() {
574      @Override
575      public void apply(JournalAndStream jas) throws IOException {
576        jas.getManager().recoverUnfinalizedSegments();
577      }
578    }, "recoverUnfinalizedSegments");
579  }
580  
581  /**
582   * Return a manifest of what finalized edit logs are available. All available
583   * edit logs are returned starting from the transaction id passed. If
584   * 'fromTxId' falls in the middle of a log, that log is returned as well.
585   * 
586   * @param fromTxId Starting transaction id to read the logs.
587   * @return RemoteEditLogManifest object.
588   */
589  public synchronized RemoteEditLogManifest getEditLogManifest(long fromTxId) {
590    // Collect RemoteEditLogs available from each FileJournalManager
591    List<RemoteEditLog> allLogs = Lists.newArrayList();
592    for (JournalAndStream j : journals) {
593      if (j.getManager() instanceof FileJournalManager) {
594        FileJournalManager fjm = (FileJournalManager)j.getManager();
595        try {
596          allLogs.addAll(fjm.getRemoteEditLogs(fromTxId, false));
597        } catch (Throwable t) {
598          LOG.warn("Cannot list edit logs in " + fjm, t);
599        }
600      }
601    }
602    
603    // Group logs by their starting txid
604    ImmutableListMultimap<Long, RemoteEditLog> logsByStartTxId =
605      Multimaps.index(allLogs, RemoteEditLog.GET_START_TXID);
606    long curStartTxId = fromTxId;
607
608    List<RemoteEditLog> logs = Lists.newArrayList();
609    while (true) {
610      ImmutableList<RemoteEditLog> logGroup = logsByStartTxId.get(curStartTxId);
611      if (logGroup.isEmpty()) {
612        // we have a gap in logs - for example because we recovered some old
613        // storage directory with ancient logs. Clear out any logs we've
614        // accumulated so far, and then skip to the next segment of logs
615        // after the gap.
616        SortedSet<Long> startTxIds = Sets.newTreeSet(logsByStartTxId.keySet());
617        startTxIds = startTxIds.tailSet(curStartTxId);
618        if (startTxIds.isEmpty()) {
619          break;
620        } else {
621          if (LOG.isDebugEnabled()) {
622            LOG.debug("Found gap in logs at " + curStartTxId + ": " +
623                "not returning previous logs in manifest.");
624          }
625          logs.clear();
626          curStartTxId = startTxIds.first();
627          continue;
628        }
629      }
630
631      // Find the one that extends the farthest forward
632      RemoteEditLog bestLog = Collections.max(logGroup);
633      logs.add(bestLog);
634      // And then start looking from after that point
635      curStartTxId = bestLog.getEndTxId() + 1;
636    }
637    RemoteEditLogManifest ret = new RemoteEditLogManifest(logs);
638    
639    if (LOG.isDebugEnabled()) {
640      LOG.debug("Generated manifest for logs since " + fromTxId + ":"
641          + ret);      
642    }
643    return ret;
644  }
645
646  /**
647   * Add sync times to the buffer.
648   */
649  String getSyncTimes() {
650    StringBuilder buf = new StringBuilder();
651    for (JournalAndStream jas : journals) {
652      if (jas.isActive()) {
653        buf.append(jas.getCurrentStream().getTotalSyncTime());
654        buf.append(" ");
655      }
656    }
657    return buf.toString();
658  }
659}