001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.blockmanagement; 019 020import java.util.ArrayList; 021import java.util.Collection; 022import java.util.Collections; 023import java.util.HashMap; 024import java.util.Iterator; 025import java.util.LinkedList; 026import java.util.List; 027import java.util.Map; 028import java.util.Queue; 029 030import org.apache.commons.logging.Log; 031import org.apache.commons.logging.LogFactory; 032import org.apache.hadoop.classification.InterfaceAudience; 033import org.apache.hadoop.classification.InterfaceStability; 034import org.apache.hadoop.hdfs.protocol.Block; 035import org.apache.hadoop.hdfs.protocol.DatanodeID; 036import org.apache.hadoop.hdfs.protocol.DatanodeInfo; 037import org.apache.hadoop.hdfs.server.namenode.CachedBlock; 038import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage; 039import org.apache.hadoop.hdfs.server.protocol.StorageReport; 040import org.apache.hadoop.hdfs.util.LightWeightHashSet; 041import org.apache.hadoop.util.IntrusiveCollection; 042import org.apache.hadoop.util.Time; 043 044import com.google.common.annotations.VisibleForTesting; 045 046/** 047 * This class extends the DatanodeInfo class with ephemeral information (eg 048 * health, capacity, what blocks are associated with the Datanode) that is 049 * private to the Namenode, ie this class is not exposed to clients. 050 */ 051@InterfaceAudience.Private 052@InterfaceStability.Evolving 053public class DatanodeDescriptor extends DatanodeInfo { 054 public static final Log LOG = LogFactory.getLog(DatanodeDescriptor.class); 055 public static final DatanodeDescriptor[] EMPTY_ARRAY = {}; 056 057 // Stores status of decommissioning. 058 // If node is not decommissioning, do not use this object for anything. 059 public DecommissioningStatus decommissioningStatus = new DecommissioningStatus(); 060 061 /** Block and targets pair */ 062 @InterfaceAudience.Private 063 @InterfaceStability.Evolving 064 public static class BlockTargetPair { 065 public final Block block; 066 public final DatanodeStorageInfo[] targets; 067 068 BlockTargetPair(Block block, DatanodeStorageInfo[] targets) { 069 this.block = block; 070 this.targets = targets; 071 } 072 } 073 074 /** A BlockTargetPair queue. */ 075 private static class BlockQueue<E> { 076 private final Queue<E> blockq = new LinkedList<E>(); 077 078 /** Size of the queue */ 079 synchronized int size() {return blockq.size();} 080 081 /** Enqueue */ 082 synchronized boolean offer(E e) { 083 return blockq.offer(e); 084 } 085 086 /** Dequeue */ 087 synchronized List<E> poll(int numBlocks) { 088 if (numBlocks <= 0 || blockq.isEmpty()) { 089 return null; 090 } 091 092 List<E> results = new ArrayList<E>(); 093 for(; !blockq.isEmpty() && numBlocks > 0; numBlocks--) { 094 results.add(blockq.poll()); 095 } 096 return results; 097 } 098 099 /** 100 * Returns <tt>true</tt> if the queue contains the specified element. 101 */ 102 boolean contains(E e) { 103 return blockq.contains(e); 104 } 105 106 synchronized void clear() { 107 blockq.clear(); 108 } 109 } 110 111 private final Map<String, DatanodeStorageInfo> storageMap = 112 new HashMap<String, DatanodeStorageInfo>(); 113 114 /** 115 * A list of CachedBlock objects on this datanode. 116 */ 117 public static class CachedBlocksList extends IntrusiveCollection<CachedBlock> { 118 public enum Type { 119 PENDING_CACHED, 120 CACHED, 121 PENDING_UNCACHED 122 } 123 124 private final DatanodeDescriptor datanode; 125 126 private final Type type; 127 128 CachedBlocksList(DatanodeDescriptor datanode, Type type) { 129 this.datanode = datanode; 130 this.type = type; 131 } 132 133 public DatanodeDescriptor getDatanode() { 134 return datanode; 135 } 136 137 public Type getType() { 138 return type; 139 } 140 } 141 142 /** 143 * The blocks which we want to cache on this DataNode. 144 */ 145 private final CachedBlocksList pendingCached = 146 new CachedBlocksList(this, CachedBlocksList.Type.PENDING_CACHED); 147 148 /** 149 * The blocks which we know are cached on this datanode. 150 * This list is updated by periodic cache reports. 151 */ 152 private final CachedBlocksList cached = 153 new CachedBlocksList(this, CachedBlocksList.Type.CACHED); 154 155 /** 156 * The blocks which we want to uncache on this DataNode. 157 */ 158 private final CachedBlocksList pendingUncached = 159 new CachedBlocksList(this, CachedBlocksList.Type.PENDING_UNCACHED); 160 161 public CachedBlocksList getPendingCached() { 162 return pendingCached; 163 } 164 165 public CachedBlocksList getCached() { 166 return cached; 167 } 168 169 public CachedBlocksList getPendingUncached() { 170 return pendingUncached; 171 } 172 173 /** 174 * The time when the last batch of caching directives was sent, in 175 * monotonic milliseconds. 176 */ 177 private long lastCachingDirectiveSentTimeMs; 178 179 // isAlive == heartbeats.contains(this) 180 // This is an optimization, because contains takes O(n) time on Arraylist 181 public boolean isAlive = false; 182 public boolean needKeyUpdate = false; 183 184 185 // A system administrator can tune the balancer bandwidth parameter 186 // (dfs.balance.bandwidthPerSec) dynamically by calling 187 // "dfsadmin -setBalanacerBandwidth <newbandwidth>", at which point the 188 // following 'bandwidth' variable gets updated with the new value for each 189 // node. Once the heartbeat command is issued to update the value on the 190 // specified datanode, this value will be set back to 0. 191 private long bandwidth; 192 193 /** A queue of blocks to be replicated by this datanode */ 194 private BlockQueue<BlockTargetPair> replicateBlocks = new BlockQueue<BlockTargetPair>(); 195 /** A queue of blocks to be recovered by this datanode */ 196 private BlockQueue<BlockInfoUnderConstruction> recoverBlocks = 197 new BlockQueue<BlockInfoUnderConstruction>(); 198 /** A set of blocks to be invalidated by this datanode */ 199 private LightWeightHashSet<Block> invalidateBlocks = new LightWeightHashSet<Block>(); 200 201 /* Variables for maintaining number of blocks scheduled to be written to 202 * this storage. This count is approximate and might be slightly bigger 203 * in case of errors (e.g. datanode does not report if an error occurs 204 * while writing the block). 205 */ 206 private int currApproxBlocksScheduled = 0; 207 private int prevApproxBlocksScheduled = 0; 208 private long lastBlocksScheduledRollTime = 0; 209 private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min 210 private int volumeFailures = 0; 211 212 /** 213 * When set to true, the node is not in include list and is not allowed 214 * to communicate with the namenode 215 */ 216 private boolean disallowed = false; 217 218 /** 219 * DatanodeDescriptor constructor 220 * @param nodeID id of the data node 221 */ 222 public DatanodeDescriptor(DatanodeID nodeID) { 223 super(nodeID); 224 updateHeartbeat(StorageReport.EMPTY_ARRAY, 0L, 0L, 0, 0); 225 } 226 227 /** 228 * DatanodeDescriptor constructor 229 * @param nodeID id of the data node 230 * @param networkLocation location of the data node in network 231 */ 232 public DatanodeDescriptor(DatanodeID nodeID, 233 String networkLocation) { 234 super(nodeID, networkLocation); 235 updateHeartbeat(StorageReport.EMPTY_ARRAY, 0L, 0L, 0, 0); 236 } 237 238 /** 239 * Add data-node to the block. Add block to the head of the list of blocks 240 * belonging to the data-node. 241 */ 242 public boolean addBlock(String storageID, BlockInfo b) { 243 DatanodeStorageInfo s = getStorageInfo(storageID); 244 if (s != null) { 245 return s.addBlock(b); 246 } 247 return false; 248 } 249 250 DatanodeStorageInfo getStorageInfo(String storageID) { 251 synchronized (storageMap) { 252 return storageMap.get(storageID); 253 } 254 } 255 DatanodeStorageInfo[] getStorageInfos() { 256 synchronized (storageMap) { 257 final Collection<DatanodeStorageInfo> storages = storageMap.values(); 258 return storages.toArray(new DatanodeStorageInfo[storages.size()]); 259 } 260 } 261 262 /** 263 * Remove block from the list of blocks belonging to the data-node. Remove 264 * data-node from the block. 265 */ 266 boolean removeBlock(BlockInfo b) { 267 int index = b.findStorageInfo(this); 268 // if block exists on this datanode 269 if (index >= 0) { 270 DatanodeStorageInfo s = b.getStorageInfo(index); 271 if (s != null) { 272 return s.removeBlock(b); 273 } 274 } 275 return false; 276 } 277 278 /** 279 * Remove block from the list of blocks belonging to the data-node. Remove 280 * data-node from the block. 281 */ 282 boolean removeBlock(String storageID, BlockInfo b) { 283 DatanodeStorageInfo s = getStorageInfo(storageID); 284 if (s != null) { 285 return s.removeBlock(b); 286 } 287 return false; 288 } 289 290 /** 291 * Replace specified old block with a new one in the DataNodeDescriptor. 292 * 293 * @param oldBlock - block to be replaced 294 * @param newBlock - a replacement block 295 * @return the new block 296 */ 297 public BlockInfo replaceBlock(BlockInfo oldBlock, BlockInfo newBlock) { 298 int index = oldBlock.findStorageInfo(this); 299 DatanodeStorageInfo s = oldBlock.getStorageInfo(index); 300 boolean done = s.removeBlock(oldBlock); 301 assert done : "Old block should belong to the data-node when replacing"; 302 303 done = s.addBlock(newBlock); 304 assert done : "New block should not belong to the data-node when replacing"; 305 return newBlock; 306 } 307 308 public void resetBlocks() { 309 setCapacity(0); 310 setRemaining(0); 311 setBlockPoolUsed(0); 312 setDfsUsed(0); 313 setXceiverCount(0); 314 this.invalidateBlocks.clear(); 315 this.volumeFailures = 0; 316 // pendingCached, cached, and pendingUncached are protected by the 317 // FSN lock. 318 this.pendingCached.clear(); 319 this.cached.clear(); 320 this.pendingUncached.clear(); 321 } 322 323 public void clearBlockQueues() { 324 synchronized (invalidateBlocks) { 325 this.invalidateBlocks.clear(); 326 this.recoverBlocks.clear(); 327 this.replicateBlocks.clear(); 328 } 329 // pendingCached, cached, and pendingUncached are protected by the 330 // FSN lock. 331 this.pendingCached.clear(); 332 this.cached.clear(); 333 this.pendingUncached.clear(); 334 } 335 336 public int numBlocks() { 337 int blocks = 0; 338 for (DatanodeStorageInfo entry : getStorageInfos()) { 339 blocks += entry.numBlocks(); 340 } 341 return blocks; 342 } 343 344 /** 345 * Updates stats from datanode heartbeat. 346 */ 347 public void updateHeartbeat(StorageReport[] reports, long cacheCapacity, 348 long cacheUsed, int xceiverCount, int volFailures) { 349 long totalCapacity = 0; 350 long totalRemaining = 0; 351 long totalBlockPoolUsed = 0; 352 long totalDfsUsed = 0; 353 354 setCacheCapacity(cacheCapacity); 355 setCacheUsed(cacheUsed); 356 setXceiverCount(xceiverCount); 357 setLastUpdate(Time.now()); 358 this.volumeFailures = volFailures; 359 for (StorageReport report : reports) { 360 DatanodeStorageInfo storage = storageMap.get(report.getStorage().getStorageID()); 361 if (storage == null) { 362 // This is seen during cluster initialization when the heartbeat 363 // is received before the initial block reports from each storage. 364 storage = updateStorage(report.getStorage()); 365 } 366 storage.receivedHeartbeat(report); 367 totalCapacity += report.getCapacity(); 368 totalRemaining += report.getRemaining(); 369 totalBlockPoolUsed += report.getBlockPoolUsed(); 370 totalDfsUsed += report.getDfsUsed(); 371 } 372 rollBlocksScheduled(getLastUpdate()); 373 374 // Update total metrics for the node. 375 setCapacity(totalCapacity); 376 setRemaining(totalRemaining); 377 setBlockPoolUsed(totalBlockPoolUsed); 378 setDfsUsed(totalDfsUsed); 379 } 380 381 private static class BlockIterator implements Iterator<BlockInfo> { 382 private int index = 0; 383 private final List<Iterator<BlockInfo>> iterators; 384 385 private BlockIterator(final DatanodeStorageInfo... storages) { 386 List<Iterator<BlockInfo>> iterators = new ArrayList<Iterator<BlockInfo>>(); 387 for (DatanodeStorageInfo e : storages) { 388 iterators.add(e.getBlockIterator()); 389 } 390 this.iterators = Collections.unmodifiableList(iterators); 391 } 392 393 @Override 394 public boolean hasNext() { 395 update(); 396 return !iterators.isEmpty() && iterators.get(index).hasNext(); 397 } 398 399 @Override 400 public BlockInfo next() { 401 update(); 402 return iterators.get(index).next(); 403 } 404 405 @Override 406 public void remove() { 407 throw new UnsupportedOperationException("Remove unsupported."); 408 } 409 410 private void update() { 411 while(index < iterators.size() - 1 && !iterators.get(index).hasNext()) { 412 index++; 413 } 414 } 415 } 416 417 Iterator<BlockInfo> getBlockIterator() { 418 return new BlockIterator(getStorageInfos()); 419 } 420 Iterator<BlockInfo> getBlockIterator(final String storageID) { 421 return new BlockIterator(getStorageInfo(storageID)); 422 } 423 424 /** 425 * Store block replication work. 426 */ 427 void addBlockToBeReplicated(Block block, DatanodeStorageInfo[] targets) { 428 assert(block != null && targets != null && targets.length > 0); 429 replicateBlocks.offer(new BlockTargetPair(block, targets)); 430 } 431 432 /** 433 * Store block recovery work. 434 */ 435 void addBlockToBeRecovered(BlockInfoUnderConstruction block) { 436 if(recoverBlocks.contains(block)) { 437 // this prevents adding the same block twice to the recovery queue 438 BlockManager.LOG.info(block + " is already in the recovery queue"); 439 return; 440 } 441 recoverBlocks.offer(block); 442 } 443 444 /** 445 * Store block invalidation work. 446 */ 447 void addBlocksToBeInvalidated(List<Block> blocklist) { 448 assert(blocklist != null && blocklist.size() > 0); 449 synchronized (invalidateBlocks) { 450 for(Block blk : blocklist) { 451 invalidateBlocks.add(blk); 452 } 453 } 454 } 455 456 /** 457 * The number of work items that are pending to be replicated 458 */ 459 int getNumberOfBlocksToBeReplicated() { 460 return replicateBlocks.size(); 461 } 462 463 /** 464 * The number of block invalidation items that are pending to 465 * be sent to the datanode 466 */ 467 int getNumberOfBlocksToBeInvalidated() { 468 synchronized (invalidateBlocks) { 469 return invalidateBlocks.size(); 470 } 471 } 472 473 public List<BlockTargetPair> getReplicationCommand(int maxTransfers) { 474 return replicateBlocks.poll(maxTransfers); 475 } 476 477 public BlockInfoUnderConstruction[] getLeaseRecoveryCommand(int maxTransfers) { 478 List<BlockInfoUnderConstruction> blocks = recoverBlocks.poll(maxTransfers); 479 if(blocks == null) 480 return null; 481 return blocks.toArray(new BlockInfoUnderConstruction[blocks.size()]); 482 } 483 484 /** 485 * Remove the specified number of blocks to be invalidated 486 */ 487 public Block[] getInvalidateBlocks(int maxblocks) { 488 synchronized (invalidateBlocks) { 489 Block[] deleteList = invalidateBlocks.pollToArray(new Block[Math.min( 490 invalidateBlocks.size(), maxblocks)]); 491 return deleteList.length == 0 ? null : deleteList; 492 } 493 } 494 495 /** 496 * @return Approximate number of blocks currently scheduled to be written 497 * to this datanode. 498 */ 499 public int getBlocksScheduled() { 500 return currApproxBlocksScheduled + prevApproxBlocksScheduled; 501 } 502 503 /** Increment the number of blocks scheduled. */ 504 void incrementBlocksScheduled() { 505 currApproxBlocksScheduled++; 506 } 507 508 /** Decrement the number of blocks scheduled. */ 509 void decrementBlocksScheduled() { 510 if (prevApproxBlocksScheduled > 0) { 511 prevApproxBlocksScheduled--; 512 } else if (currApproxBlocksScheduled > 0) { 513 currApproxBlocksScheduled--; 514 } 515 // its ok if both counters are zero. 516 } 517 518 /** Adjusts curr and prev number of blocks scheduled every few minutes. */ 519 private void rollBlocksScheduled(long now) { 520 if (now - lastBlocksScheduledRollTime > BLOCKS_SCHEDULED_ROLL_INTERVAL) { 521 prevApproxBlocksScheduled = currApproxBlocksScheduled; 522 currApproxBlocksScheduled = 0; 523 lastBlocksScheduledRollTime = now; 524 } 525 } 526 527 @Override 528 public int hashCode() { 529 // Super implementation is sufficient 530 return super.hashCode(); 531 } 532 533 @Override 534 public boolean equals(Object obj) { 535 // Sufficient to use super equality as datanodes are uniquely identified 536 // by DatanodeID 537 return (this == obj) || super.equals(obj); 538 } 539 540 /** Decommissioning status */ 541 public class DecommissioningStatus { 542 private int underReplicatedBlocks; 543 private int decommissionOnlyReplicas; 544 private int underReplicatedInOpenFiles; 545 private long startTime; 546 547 synchronized void set(int underRep, 548 int onlyRep, int underConstruction) { 549 if (isDecommissionInProgress() == false) { 550 return; 551 } 552 underReplicatedBlocks = underRep; 553 decommissionOnlyReplicas = onlyRep; 554 underReplicatedInOpenFiles = underConstruction; 555 } 556 557 /** @return the number of under-replicated blocks */ 558 public synchronized int getUnderReplicatedBlocks() { 559 if (isDecommissionInProgress() == false) { 560 return 0; 561 } 562 return underReplicatedBlocks; 563 } 564 /** @return the number of decommission-only replicas */ 565 public synchronized int getDecommissionOnlyReplicas() { 566 if (isDecommissionInProgress() == false) { 567 return 0; 568 } 569 return decommissionOnlyReplicas; 570 } 571 /** @return the number of under-replicated blocks in open files */ 572 public synchronized int getUnderReplicatedInOpenFiles() { 573 if (isDecommissionInProgress() == false) { 574 return 0; 575 } 576 return underReplicatedInOpenFiles; 577 } 578 /** Set start time */ 579 public synchronized void setStartTime(long time) { 580 startTime = time; 581 } 582 /** @return start time */ 583 public synchronized long getStartTime() { 584 if (isDecommissionInProgress() == false) { 585 return 0; 586 } 587 return startTime; 588 } 589 } // End of class DecommissioningStatus 590 591 /** 592 * Set the flag to indicate if this datanode is disallowed from communicating 593 * with the namenode. 594 */ 595 public void setDisallowed(boolean flag) { 596 disallowed = flag; 597 } 598 /** Is the datanode disallowed from communicating with the namenode? */ 599 public boolean isDisallowed() { 600 return disallowed; 601 } 602 603 /** 604 * @return number of failed volumes in the datanode. 605 */ 606 public int getVolumeFailures() { 607 return volumeFailures; 608 } 609 610 /** 611 * @param nodeReg DatanodeID to update registration for. 612 */ 613 @Override 614 public void updateRegInfo(DatanodeID nodeReg) { 615 super.updateRegInfo(nodeReg); 616 617 // must re-process IBR after re-registration 618 for(DatanodeStorageInfo storage : getStorageInfos()) { 619 storage.setBlockReportCount(0); 620 } 621 } 622 623 /** 624 * @return balancer bandwidth in bytes per second for this datanode 625 */ 626 public long getBalancerBandwidth() { 627 return this.bandwidth; 628 } 629 630 /** 631 * @param bandwidth balancer bandwidth in bytes per second for this datanode 632 */ 633 public void setBalancerBandwidth(long bandwidth) { 634 this.bandwidth = bandwidth; 635 } 636 637 @Override 638 public String dumpDatanode() { 639 StringBuilder sb = new StringBuilder(super.dumpDatanode()); 640 int repl = replicateBlocks.size(); 641 if (repl > 0) { 642 sb.append(" ").append(repl).append(" blocks to be replicated;"); 643 } 644 int inval = invalidateBlocks.size(); 645 if (inval > 0) { 646 sb.append(" ").append(inval).append(" blocks to be invalidated;"); 647 } 648 int recover = recoverBlocks.size(); 649 if (recover > 0) { 650 sb.append(" ").append(recover).append(" blocks to be recovered;"); 651 } 652 return sb.toString(); 653 } 654 655 DatanodeStorageInfo updateStorage(DatanodeStorage s) { 656 synchronized (storageMap) { 657 DatanodeStorageInfo storage = storageMap.get(s.getStorageID()); 658 if (storage == null) { 659 LOG.info("Adding new storage ID " + s.getStorageID() + 660 " for DN " + getXferAddr()); 661 storage = new DatanodeStorageInfo(this, s); 662 storageMap.put(s.getStorageID(), storage); 663 } 664 return storage; 665 } 666 } 667 668 /** 669 * @return The time at which we last sent caching directives to this 670 * DataNode, in monotonic milliseconds. 671 */ 672 public long getLastCachingDirectiveSentTimeMs() { 673 return this.lastCachingDirectiveSentTimeMs; 674 } 675 676 /** 677 * @param time The time at which we last sent caching directives to this 678 * DataNode, in monotonic milliseconds. 679 */ 680 public void setLastCachingDirectiveSentTimeMs(long time) { 681 this.lastCachingDirectiveSentTimeMs = time; 682 } 683} 684