001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.namenode; 019 020import static org.apache.hadoop.util.ExitUtil.terminate; 021 022import java.io.IOException; 023import java.util.ArrayList; 024import java.util.Collection; 025import java.util.Collections; 026import java.util.Comparator; 027import java.util.LinkedList; 028import java.util.List; 029import java.util.PriorityQueue; 030import java.util.SortedSet; 031import java.util.concurrent.CopyOnWriteArrayList; 032 033import org.apache.commons.logging.Log; 034import org.apache.commons.logging.LogFactory; 035import org.apache.hadoop.classification.InterfaceAudience; 036import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; 037import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog; 038import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; 039 040import static org.apache.hadoop.util.ExitUtil.terminate; 041 042import com.google.common.base.Preconditions; 043import com.google.common.collect.ComparisonChain; 044import com.google.common.collect.ImmutableList; 045import com.google.common.collect.ImmutableListMultimap; 046import com.google.common.collect.Lists; 047import com.google.common.collect.Multimaps; 048import com.google.common.collect.Sets; 049 050/** 051 * Manages a collection of Journals. None of the methods are synchronized, it is 052 * assumed that FSEditLog methods, that use this class, use proper 053 * synchronization. 054 */ 055public class JournalSet implements JournalManager { 056 057 static final Log LOG = LogFactory.getLog(FSEditLog.class); 058 059 static final public Comparator<EditLogInputStream> 060 EDIT_LOG_INPUT_STREAM_COMPARATOR = new Comparator<EditLogInputStream>() { 061 @Override 062 public int compare(EditLogInputStream a, EditLogInputStream b) { 063 return ComparisonChain.start(). 064 compare(a.getFirstTxId(), b.getFirstTxId()). 065 compare(b.getLastTxId(), a.getLastTxId()). 066 result(); 067 } 068 }; 069 070 /** 071 * Container for a JournalManager paired with its currently 072 * active stream. 073 * 074 * If a Journal gets disabled due to an error writing to its 075 * stream, then the stream will be aborted and set to null. 076 */ 077 static class JournalAndStream implements CheckableNameNodeResource { 078 private final JournalManager journal; 079 private boolean disabled = false; 080 private EditLogOutputStream stream; 081 private boolean required = false; 082 083 public JournalAndStream(JournalManager manager, boolean required) { 084 this.journal = manager; 085 this.required = required; 086 } 087 088 public void startLogSegment(long txId) throws IOException { 089 Preconditions.checkState(stream == null); 090 disabled = false; 091 stream = journal.startLogSegment(txId); 092 } 093 094 /** 095 * Closes the stream, also sets it to null. 096 */ 097 public void closeStream() throws IOException { 098 if (stream == null) return; 099 stream.close(); 100 stream = null; 101 } 102 103 /** 104 * Close the Journal and Stream 105 */ 106 public void close() throws IOException { 107 closeStream(); 108 109 journal.close(); 110 } 111 112 /** 113 * Aborts the stream, also sets it to null. 114 */ 115 public void abort() { 116 if (stream == null) return; 117 try { 118 stream.abort(); 119 } catch (IOException ioe) { 120 LOG.error("Unable to abort stream " + stream, ioe); 121 } 122 stream = null; 123 } 124 125 boolean isActive() { 126 return stream != null; 127 } 128 129 /** 130 * Should be used outside JournalSet only for testing. 131 */ 132 EditLogOutputStream getCurrentStream() { 133 return stream; 134 } 135 136 @Override 137 public String toString() { 138 return "JournalAndStream(mgr=" + journal + 139 ", " + "stream=" + stream + ")"; 140 } 141 142 void setCurrentStreamForTests(EditLogOutputStream stream) { 143 this.stream = stream; 144 } 145 146 JournalManager getManager() { 147 return journal; 148 } 149 150 boolean isDisabled() { 151 return disabled; 152 } 153 154 private void setDisabled(boolean disabled) { 155 this.disabled = disabled; 156 } 157 158 @Override 159 public boolean isResourceAvailable() { 160 return !isDisabled(); 161 } 162 163 @Override 164 public boolean isRequired() { 165 return required; 166 } 167 } 168 169 // COW implementation is necessary since some users (eg the web ui) call 170 // getAllJournalStreams() and then iterate. Since this is rarely 171 // mutated, there is no performance concern. 172 private List<JournalAndStream> journals = 173 new CopyOnWriteArrayList<JournalSet.JournalAndStream>(); 174 final int minimumRedundantJournals; 175 176 JournalSet(int minimumRedundantResources) { 177 this.minimumRedundantJournals = minimumRedundantResources; 178 } 179 180 @Override 181 public void format(NamespaceInfo nsInfo) throws IOException { 182 // The iteration is done by FSEditLog itself 183 throw new UnsupportedOperationException(); 184 } 185 186 @Override 187 public boolean hasSomeData() throws IOException { 188 // This is called individually on the underlying journals, 189 // not on the JournalSet. 190 throw new UnsupportedOperationException(); 191 } 192 193 194 @Override 195 public EditLogOutputStream startLogSegment(final long txId) throws IOException { 196 mapJournalsAndReportErrors(new JournalClosure() { 197 @Override 198 public void apply(JournalAndStream jas) throws IOException { 199 jas.startLogSegment(txId); 200 } 201 }, "starting log segment " + txId); 202 return new JournalSetOutputStream(); 203 } 204 205 @Override 206 public void finalizeLogSegment(final long firstTxId, final long lastTxId) 207 throws IOException { 208 mapJournalsAndReportErrors(new JournalClosure() { 209 @Override 210 public void apply(JournalAndStream jas) throws IOException { 211 if (jas.isActive()) { 212 jas.closeStream(); 213 jas.getManager().finalizeLogSegment(firstTxId, lastTxId); 214 } 215 } 216 }, "finalize log segment " + firstTxId + ", " + lastTxId); 217 } 218 219 @Override 220 public void close() throws IOException { 221 mapJournalsAndReportErrors(new JournalClosure() { 222 @Override 223 public void apply(JournalAndStream jas) throws IOException { 224 jas.close(); 225 } 226 }, "close journal"); 227 } 228 229 /** 230 * In this function, we get a bunch of streams from all of our JournalManager 231 * objects. Then we add these to the collection one by one. 232 * 233 * @param streams The collection to add the streams to. It may or 234 * may not be sorted-- this is up to the caller. 235 * @param fromTxId The transaction ID to start looking for streams at 236 * @param inProgressOk Should we consider unfinalized streams? 237 */ 238 @Override 239 public void selectInputStreams(Collection<EditLogInputStream> streams, 240 long fromTxId, boolean inProgressOk) throws IOException { 241 final PriorityQueue<EditLogInputStream> allStreams = 242 new PriorityQueue<EditLogInputStream>(64, 243 EDIT_LOG_INPUT_STREAM_COMPARATOR); 244 for (JournalAndStream jas : journals) { 245 if (jas.isDisabled()) { 246 LOG.info("Skipping jas " + jas + " since it's disabled"); 247 continue; 248 } 249 try { 250 jas.getManager().selectInputStreams(allStreams, fromTxId, inProgressOk); 251 } catch (IOException ioe) { 252 LOG.warn("Unable to determine input streams from " + jas.getManager() + 253 ". Skipping.", ioe); 254 } 255 } 256 chainAndMakeRedundantStreams(streams, allStreams, fromTxId); 257 } 258 259 public static void chainAndMakeRedundantStreams( 260 Collection<EditLogInputStream> outStreams, 261 PriorityQueue<EditLogInputStream> allStreams, long fromTxId) { 262 // We want to group together all the streams that start on the same start 263 // transaction ID. To do this, we maintain an accumulator (acc) of all 264 // the streams we've seen at a given start transaction ID. When we see a 265 // higher start transaction ID, we select a stream from the accumulator and 266 // clear it. Then we begin accumulating streams with the new, higher start 267 // transaction ID. 268 LinkedList<EditLogInputStream> acc = 269 new LinkedList<EditLogInputStream>(); 270 EditLogInputStream elis; 271 while ((elis = allStreams.poll()) != null) { 272 if (acc.isEmpty()) { 273 acc.add(elis); 274 } else { 275 long accFirstTxId = acc.get(0).getFirstTxId(); 276 if (accFirstTxId == elis.getFirstTxId()) { 277 acc.add(elis); 278 } else if (accFirstTxId < elis.getFirstTxId()) { 279 outStreams.add(new RedundantEditLogInputStream(acc, fromTxId)); 280 acc.clear(); 281 acc.add(elis); 282 } else if (accFirstTxId > elis.getFirstTxId()) { 283 throw new RuntimeException("sorted set invariants violated! " + 284 "Got stream with first txid " + elis.getFirstTxId() + 285 ", but the last firstTxId was " + accFirstTxId); 286 } 287 } 288 } 289 if (!acc.isEmpty()) { 290 outStreams.add(new RedundantEditLogInputStream(acc, fromTxId)); 291 acc.clear(); 292 } 293 } 294 295 /** 296 * Returns true if there are no journals, all redundant journals are disabled, 297 * or any required journals are disabled. 298 * 299 * @return True if there no journals, all redundant journals are disabled, 300 * or any required journals are disabled. 301 */ 302 public boolean isEmpty() { 303 return !NameNodeResourcePolicy.areResourcesAvailable(journals, 304 minimumRedundantJournals); 305 } 306 307 /** 308 * Called when some journals experience an error in some operation. 309 */ 310 private void disableAndReportErrorOnJournals(List<JournalAndStream> badJournals) { 311 if (badJournals == null || badJournals.isEmpty()) { 312 return; // nothing to do 313 } 314 315 for (JournalAndStream j : badJournals) { 316 LOG.error("Disabling journal " + j); 317 j.abort(); 318 j.setDisabled(true); 319 } 320 } 321 322 /** 323 * Implementations of this interface encapsulate operations that can be 324 * iteratively applied on all the journals. For example see 325 * {@link JournalSet#mapJournalsAndReportErrors}. 326 */ 327 private interface JournalClosure { 328 /** 329 * The operation on JournalAndStream. 330 * @param jas Object on which operations are performed. 331 * @throws IOException 332 */ 333 public void apply(JournalAndStream jas) throws IOException; 334 } 335 336 /** 337 * Apply the given operation across all of the journal managers, disabling 338 * any for which the closure throws an IOException. 339 * @param closure {@link JournalClosure} object encapsulating the operation. 340 * @param status message used for logging errors (e.g. "opening journal") 341 * @throws IOException If the operation fails on all the journals. 342 */ 343 private void mapJournalsAndReportErrors( 344 JournalClosure closure, String status) throws IOException{ 345 346 List<JournalAndStream> badJAS = Lists.newLinkedList(); 347 for (JournalAndStream jas : journals) { 348 try { 349 closure.apply(jas); 350 } catch (Throwable t) { 351 if (jas.isRequired()) { 352 final String msg = "Error: " + status + " failed for required journal (" 353 + jas + ")"; 354 LOG.fatal(msg, t); 355 // If we fail on *any* of the required journals, then we must not 356 // continue on any of the other journals. Abort them to ensure that 357 // retry behavior doesn't allow them to keep going in any way. 358 abortAllJournals(); 359 // the current policy is to shutdown the NN on errors to shared edits 360 // dir. There are many code paths to shared edits failures - syncs, 361 // roll of edits etc. All of them go through this common function 362 // where the isRequired() check is made. Applying exit policy here 363 // to catch all code paths. 364 terminate(1, msg); 365 } else { 366 LOG.error("Error: " + status + " failed for (journal " + jas + ")", t); 367 badJAS.add(jas); 368 } 369 } 370 } 371 disableAndReportErrorOnJournals(badJAS); 372 if (!NameNodeResourcePolicy.areResourcesAvailable(journals, 373 minimumRedundantJournals)) { 374 String message = status + " failed for too many journals"; 375 LOG.error("Error: " + message); 376 throw new IOException(message); 377 } 378 } 379 380 /** 381 * Abort all of the underlying streams. 382 */ 383 private void abortAllJournals() { 384 for (JournalAndStream jas : journals) { 385 if (jas.isActive()) { 386 jas.abort(); 387 } 388 } 389 } 390 391 /** 392 * An implementation of EditLogOutputStream that applies a requested method on 393 * all the journals that are currently active. 394 */ 395 private class JournalSetOutputStream extends EditLogOutputStream { 396 397 JournalSetOutputStream() throws IOException { 398 super(); 399 } 400 401 @Override 402 public void write(final FSEditLogOp op) 403 throws IOException { 404 mapJournalsAndReportErrors(new JournalClosure() { 405 @Override 406 public void apply(JournalAndStream jas) throws IOException { 407 if (jas.isActive()) { 408 jas.getCurrentStream().write(op); 409 } 410 } 411 }, "write op"); 412 } 413 414 @Override 415 public void writeRaw(final byte[] data, final int offset, final int length) 416 throws IOException { 417 mapJournalsAndReportErrors(new JournalClosure() { 418 @Override 419 public void apply(JournalAndStream jas) throws IOException { 420 if (jas.isActive()) { 421 jas.getCurrentStream().writeRaw(data, offset, length); 422 } 423 } 424 }, "write bytes"); 425 } 426 427 @Override 428 public void create() throws IOException { 429 mapJournalsAndReportErrors(new JournalClosure() { 430 @Override 431 public void apply(JournalAndStream jas) throws IOException { 432 if (jas.isActive()) { 433 jas.getCurrentStream().create(); 434 } 435 } 436 }, "create"); 437 } 438 439 @Override 440 public void close() throws IOException { 441 mapJournalsAndReportErrors(new JournalClosure() { 442 @Override 443 public void apply(JournalAndStream jas) throws IOException { 444 jas.closeStream(); 445 } 446 }, "close"); 447 } 448 449 @Override 450 public void abort() throws IOException { 451 mapJournalsAndReportErrors(new JournalClosure() { 452 @Override 453 public void apply(JournalAndStream jas) throws IOException { 454 jas.abort(); 455 } 456 }, "abort"); 457 } 458 459 @Override 460 public void setReadyToFlush() throws IOException { 461 mapJournalsAndReportErrors(new JournalClosure() { 462 @Override 463 public void apply(JournalAndStream jas) throws IOException { 464 if (jas.isActive()) { 465 jas.getCurrentStream().setReadyToFlush(); 466 } 467 } 468 }, "setReadyToFlush"); 469 } 470 471 @Override 472 protected void flushAndSync(final boolean durable) throws IOException { 473 mapJournalsAndReportErrors(new JournalClosure() { 474 @Override 475 public void apply(JournalAndStream jas) throws IOException { 476 if (jas.isActive()) { 477 jas.getCurrentStream().flushAndSync(durable); 478 } 479 } 480 }, "flushAndSync"); 481 } 482 483 @Override 484 public void flush() throws IOException { 485 mapJournalsAndReportErrors(new JournalClosure() { 486 @Override 487 public void apply(JournalAndStream jas) throws IOException { 488 if (jas.isActive()) { 489 jas.getCurrentStream().flush(); 490 } 491 } 492 }, "flush"); 493 } 494 495 @Override 496 public boolean shouldForceSync() { 497 for (JournalAndStream js : journals) { 498 if (js.isActive() && js.getCurrentStream().shouldForceSync()) { 499 return true; 500 } 501 } 502 return false; 503 } 504 505 @Override 506 protected long getNumSync() { 507 for (JournalAndStream jas : journals) { 508 if (jas.isActive()) { 509 return jas.getCurrentStream().getNumSync(); 510 } 511 } 512 return 0; 513 } 514 } 515 516 @Override 517 public void setOutputBufferCapacity(final int size) { 518 try { 519 mapJournalsAndReportErrors(new JournalClosure() { 520 @Override 521 public void apply(JournalAndStream jas) throws IOException { 522 jas.getManager().setOutputBufferCapacity(size); 523 } 524 }, "setOutputBufferCapacity"); 525 } catch (IOException e) { 526 LOG.error("Error in setting outputbuffer capacity"); 527 } 528 } 529 530 List<JournalAndStream> getAllJournalStreams() { 531 return journals; 532 } 533 534 List<JournalManager> getJournalManagers() { 535 List<JournalManager> jList = new ArrayList<JournalManager>(); 536 for (JournalAndStream j : journals) { 537 jList.add(j.getManager()); 538 } 539 return jList; 540 } 541 542 void add(JournalManager j, boolean required) { 543 JournalAndStream jas = new JournalAndStream(j, required); 544 journals.add(jas); 545 } 546 547 void remove(JournalManager j) { 548 JournalAndStream jasToRemove = null; 549 for (JournalAndStream jas: journals) { 550 if (jas.getManager().equals(j)) { 551 jasToRemove = jas; 552 break; 553 } 554 } 555 if (jasToRemove != null) { 556 jasToRemove.abort(); 557 journals.remove(jasToRemove); 558 } 559 } 560 561 @Override 562 public void purgeLogsOlderThan(final long minTxIdToKeep) throws IOException { 563 mapJournalsAndReportErrors(new JournalClosure() { 564 @Override 565 public void apply(JournalAndStream jas) throws IOException { 566 jas.getManager().purgeLogsOlderThan(minTxIdToKeep); 567 } 568 }, "purgeLogsOlderThan " + minTxIdToKeep); 569 } 570 571 @Override 572 public void recoverUnfinalizedSegments() throws IOException { 573 mapJournalsAndReportErrors(new JournalClosure() { 574 @Override 575 public void apply(JournalAndStream jas) throws IOException { 576 jas.getManager().recoverUnfinalizedSegments(); 577 } 578 }, "recoverUnfinalizedSegments"); 579 } 580 581 /** 582 * Return a manifest of what finalized edit logs are available. All available 583 * edit logs are returned starting from the transaction id passed. If 584 * 'fromTxId' falls in the middle of a log, that log is returned as well. 585 * 586 * @param fromTxId Starting transaction id to read the logs. 587 * @return RemoteEditLogManifest object. 588 */ 589 public synchronized RemoteEditLogManifest getEditLogManifest(long fromTxId) { 590 // Collect RemoteEditLogs available from each FileJournalManager 591 List<RemoteEditLog> allLogs = Lists.newArrayList(); 592 for (JournalAndStream j : journals) { 593 if (j.getManager() instanceof FileJournalManager) { 594 FileJournalManager fjm = (FileJournalManager)j.getManager(); 595 try { 596 allLogs.addAll(fjm.getRemoteEditLogs(fromTxId, false)); 597 } catch (Throwable t) { 598 LOG.warn("Cannot list edit logs in " + fjm, t); 599 } 600 } 601 } 602 603 // Group logs by their starting txid 604 ImmutableListMultimap<Long, RemoteEditLog> logsByStartTxId = 605 Multimaps.index(allLogs, RemoteEditLog.GET_START_TXID); 606 long curStartTxId = fromTxId; 607 608 List<RemoteEditLog> logs = Lists.newArrayList(); 609 while (true) { 610 ImmutableList<RemoteEditLog> logGroup = logsByStartTxId.get(curStartTxId); 611 if (logGroup.isEmpty()) { 612 // we have a gap in logs - for example because we recovered some old 613 // storage directory with ancient logs. Clear out any logs we've 614 // accumulated so far, and then skip to the next segment of logs 615 // after the gap. 616 SortedSet<Long> startTxIds = Sets.newTreeSet(logsByStartTxId.keySet()); 617 startTxIds = startTxIds.tailSet(curStartTxId); 618 if (startTxIds.isEmpty()) { 619 break; 620 } else { 621 if (LOG.isDebugEnabled()) { 622 LOG.debug("Found gap in logs at " + curStartTxId + ": " + 623 "not returning previous logs in manifest."); 624 } 625 logs.clear(); 626 curStartTxId = startTxIds.first(); 627 continue; 628 } 629 } 630 631 // Find the one that extends the farthest forward 632 RemoteEditLog bestLog = Collections.max(logGroup); 633 logs.add(bestLog); 634 // And then start looking from after that point 635 curStartTxId = bestLog.getEndTxId() + 1; 636 } 637 RemoteEditLogManifest ret = new RemoteEditLogManifest(logs); 638 639 if (LOG.isDebugEnabled()) { 640 LOG.debug("Generated manifest for logs since " + fromTxId + ":" 641 + ret); 642 } 643 return ret; 644 } 645 646 /** 647 * Add sync times to the buffer. 648 */ 649 String getSyncTimes() { 650 StringBuilder buf = new StringBuilder(); 651 for (JournalAndStream jas : journals) { 652 if (jas.isActive()) { 653 buf.append(jas.getCurrentStream().getTotalSyncTime()); 654 buf.append(" "); 655 } 656 } 657 return buf.toString(); 658 } 659}