001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.blockmanagement;
019
020import java.util.ArrayList;
021import java.util.Collection;
022import java.util.Collections;
023import java.util.HashMap;
024import java.util.Iterator;
025import java.util.LinkedList;
026import java.util.List;
027import java.util.Map;
028import java.util.Queue;
029
030import org.apache.commons.logging.Log;
031import org.apache.commons.logging.LogFactory;
032import org.apache.hadoop.classification.InterfaceAudience;
033import org.apache.hadoop.classification.InterfaceStability;
034import org.apache.hadoop.hdfs.protocol.Block;
035import org.apache.hadoop.hdfs.protocol.DatanodeID;
036import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
037import org.apache.hadoop.hdfs.server.namenode.CachedBlock;
038import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
039import org.apache.hadoop.hdfs.server.protocol.StorageReport;
040import org.apache.hadoop.hdfs.util.LightWeightHashSet;
041import org.apache.hadoop.util.IntrusiveCollection;
042import org.apache.hadoop.util.Time;
043
044import com.google.common.annotations.VisibleForTesting;
045
046/**
047 * This class extends the DatanodeInfo class with ephemeral information (eg
048 * health, capacity, what blocks are associated with the Datanode) that is
049 * private to the Namenode, ie this class is not exposed to clients.
050 */
051@InterfaceAudience.Private
052@InterfaceStability.Evolving
053public class DatanodeDescriptor extends DatanodeInfo {
  /** Logger for this class. */
  public static final Log LOG = LogFactory.getLog(DatanodeDescriptor.class);
  /** Shared zero-length descriptor array. */
  public static final DatanodeDescriptor[] EMPTY_ARRAY = {};

  // Stores status of decommissioning.
  // If node is not decommissioning, do not use this object for anything:
  // its getters report 0 unless isDecommissionInProgress() is true.
  public DecommissioningStatus decommissioningStatus = new DecommissioningStatus();
060  
061  /** Block and targets pair */
062  @InterfaceAudience.Private
063  @InterfaceStability.Evolving
064  public static class BlockTargetPair {
065    public final Block block;
066    public final DatanodeStorageInfo[] targets;    
067
068    BlockTargetPair(Block block, DatanodeStorageInfo[] targets) {
069      this.block = block;
070      this.targets = targets;
071    }
072  }
073
074  /** A BlockTargetPair queue. */
075  private static class BlockQueue<E> {
076    private final Queue<E> blockq = new LinkedList<E>();
077
078    /** Size of the queue */
079    synchronized int size() {return blockq.size();}
080
081    /** Enqueue */
082    synchronized boolean offer(E e) { 
083      return blockq.offer(e);
084    }
085
086    /** Dequeue */
087    synchronized List<E> poll(int numBlocks) {
088      if (numBlocks <= 0 || blockq.isEmpty()) {
089        return null;
090      }
091
092      List<E> results = new ArrayList<E>();
093      for(; !blockq.isEmpty() && numBlocks > 0; numBlocks--) {
094        results.add(blockq.poll());
095      }
096      return results;
097    }
098
099    /**
100     * Returns <tt>true</tt> if the queue contains the specified element.
101     */
102    boolean contains(E e) {
103      return blockq.contains(e);
104    }
105
106    synchronized void clear() {
107      blockq.clear();
108    }
109  }
110
  // Storages known for this datanode, keyed by storage ID.
  // getStorageInfo(), getStorageInfos() and updateStorage() synchronize on
  // the map itself before touching it.
  private final Map<String, DatanodeStorageInfo> storageMap = 
      new HashMap<String, DatanodeStorageInfo>();
113
114  /**
115   * A list of CachedBlock objects on this datanode.
116   */
117  public static class CachedBlocksList extends IntrusiveCollection<CachedBlock> {
118    public enum Type {
119      PENDING_CACHED,
120      CACHED,
121      PENDING_UNCACHED
122    }
123
124    private final DatanodeDescriptor datanode;
125
126    private final Type type;
127
128    CachedBlocksList(DatanodeDescriptor datanode, Type type) {
129      this.datanode = datanode;
130      this.type = type;
131    }
132
133    public DatanodeDescriptor getDatanode() {
134      return datanode;
135    }
136
137    public Type getType() {
138      return type;
139    }
140  }
141
  /**
   * The blocks which we want to cache on this DataNode.
   * Exposed through {@link #getPendingCached()}.
   */
  private final CachedBlocksList pendingCached = 
      new CachedBlocksList(this, CachedBlocksList.Type.PENDING_CACHED);

  /**
   * The blocks which we know are cached on this datanode.
   * This list is updated by periodic cache reports.
   * Exposed through {@link #getCached()}.
   */
  private final CachedBlocksList cached = 
      new CachedBlocksList(this, CachedBlocksList.Type.CACHED);

  /**
   * The blocks which we want to uncache on this DataNode.
   * Exposed through {@link #getPendingUncached()}.
   */
  private final CachedBlocksList pendingUncached = 
      new CachedBlocksList(this, CachedBlocksList.Type.PENDING_UNCACHED);
160
161  public CachedBlocksList getPendingCached() {
162    return pendingCached;
163  }
164
165  public CachedBlocksList getCached() {
166    return cached;
167  }
168
169  public CachedBlocksList getPendingUncached() {
170    return pendingUncached;
171  }
172
  /**
   * The time when the last batch of caching directives was sent, in
   * monotonic milliseconds.
   */
  private long lastCachingDirectiveSentTimeMs;

  // isAlive == heartbeats.contains(this)
  // This is an optimization, because contains takes O(n) time on ArrayList
  public boolean isAlive = false;
  // NOTE(review): never read or written inside this class; presumably tells
  // the heartbeat path that keys must be re-sent to this node -- confirm
  // against the callers.
  public boolean needKeyUpdate = false;

  
  // A system administrator can tune the balancer bandwidth parameter
  // (dfs.balance.bandwidthPerSec) dynamically by calling
  // "dfsadmin -setBalancerBandwidth <newbandwidth>", at which point the
  // following 'bandwidth' variable gets updated with the new value for each
  // node. Once the heartbeat command is issued to update the value on the
  // specified datanode, this value will be set back to 0.
  private long bandwidth;

  /** A queue of blocks to be replicated by this datanode */
  private BlockQueue<BlockTargetPair> replicateBlocks = new BlockQueue<BlockTargetPair>();
  /** A queue of blocks to be recovered by this datanode */
  private BlockQueue<BlockInfoUnderConstruction> recoverBlocks =
                                new BlockQueue<BlockInfoUnderConstruction>();
  /**
   * A set of blocks to be invalidated by this datanode. The add, poll, size
   * and clearBlockQueues() paths synchronize on the set itself.
   */
  private LightWeightHashSet<Block> invalidateBlocks = new LightWeightHashSet<Block>();

  /* Variables for maintaining number of blocks scheduled to be written to
   * this storage. This count is approximate and might be slightly bigger
   * in case of errors (e.g. datanode does not report if an error occurs
   * while writing the block).
   * The "curr" counter is moved into "prev" and reset once more than
   * BLOCKS_SCHEDULED_ROLL_INTERVAL milliseconds have passed since the
   * last roll (see rollBlocksScheduled).
   */
  private int currApproxBlocksScheduled = 0;
  private int prevApproxBlocksScheduled = 0;
  private long lastBlocksScheduledRollTime = 0;
  private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min
  // Number of failed volumes, as passed to the latest updateHeartbeat() call.
  private int volumeFailures = 0;
  
  /** 
   * When set to true, the node is not in include list and is not allowed
   * to communicate with the namenode
   */
  private boolean disallowed = false;
217
218  /**
219   * DatanodeDescriptor constructor
220   * @param nodeID id of the data node
221   */
222  public DatanodeDescriptor(DatanodeID nodeID) {
223    super(nodeID);
224    updateHeartbeat(StorageReport.EMPTY_ARRAY, 0L, 0L, 0, 0);
225  }
226
227  /**
228   * DatanodeDescriptor constructor
229   * @param nodeID id of the data node
230   * @param networkLocation location of the data node in network
231   */
232  public DatanodeDescriptor(DatanodeID nodeID, 
233                            String networkLocation) {
234    super(nodeID, networkLocation);
235    updateHeartbeat(StorageReport.EMPTY_ARRAY, 0L, 0L, 0, 0);
236  }
237
238  /**
239   * Add data-node to the block. Add block to the head of the list of blocks
240   * belonging to the data-node.
241   */
242  public boolean addBlock(String storageID, BlockInfo b) {
243    DatanodeStorageInfo s = getStorageInfo(storageID);
244    if (s != null) {
245      return s.addBlock(b);
246    }
247    return false;
248  }
249
250  DatanodeStorageInfo getStorageInfo(String storageID) {
251    synchronized (storageMap) {
252      return storageMap.get(storageID);
253    }
254  }
255  DatanodeStorageInfo[] getStorageInfos() {
256    synchronized (storageMap) {
257      final Collection<DatanodeStorageInfo> storages = storageMap.values();
258      return storages.toArray(new DatanodeStorageInfo[storages.size()]);
259    }
260  }
261
262  /**
263   * Remove block from the list of blocks belonging to the data-node. Remove
264   * data-node from the block.
265   */
266  boolean removeBlock(BlockInfo b) {
267    int index = b.findStorageInfo(this);
268    // if block exists on this datanode
269    if (index >= 0) {
270      DatanodeStorageInfo s = b.getStorageInfo(index);
271      if (s != null) {
272        return s.removeBlock(b);
273      }
274    }
275    return false;
276  }
277  
278  /**
279   * Remove block from the list of blocks belonging to the data-node. Remove
280   * data-node from the block.
281   */
282  boolean removeBlock(String storageID, BlockInfo b) {
283    DatanodeStorageInfo s = getStorageInfo(storageID);
284    if (s != null) {
285      return s.removeBlock(b);
286    }
287    return false;
288  }
289
290  /**
291   * Replace specified old block with a new one in the DataNodeDescriptor.
292   *
293   * @param oldBlock - block to be replaced
294   * @param newBlock - a replacement block
295   * @return the new block
296   */
297  public BlockInfo replaceBlock(BlockInfo oldBlock, BlockInfo newBlock) {
298    int index = oldBlock.findStorageInfo(this);
299    DatanodeStorageInfo s = oldBlock.getStorageInfo(index);
300    boolean done = s.removeBlock(oldBlock);
301    assert done : "Old block should belong to the data-node when replacing";
302
303    done = s.addBlock(newBlock);
304    assert done : "New block should not belong to the data-node when replacing";
305    return newBlock;
306  }
307
308  public void resetBlocks() {
309    setCapacity(0);
310    setRemaining(0);
311    setBlockPoolUsed(0);
312    setDfsUsed(0);
313    setXceiverCount(0);
314    this.invalidateBlocks.clear();
315    this.volumeFailures = 0;
316    // pendingCached, cached, and pendingUncached are protected by the
317    // FSN lock.
318    this.pendingCached.clear();
319    this.cached.clear();
320    this.pendingUncached.clear();
321  }
322  
323  public void clearBlockQueues() {
324    synchronized (invalidateBlocks) {
325      this.invalidateBlocks.clear();
326      this.recoverBlocks.clear();
327      this.replicateBlocks.clear();
328    }
329    // pendingCached, cached, and pendingUncached are protected by the
330    // FSN lock.
331    this.pendingCached.clear();
332    this.cached.clear();
333    this.pendingUncached.clear();
334  }
335
336  public int numBlocks() {
337    int blocks = 0;
338    for (DatanodeStorageInfo entry : getStorageInfos()) {
339      blocks += entry.numBlocks();
340    }
341    return blocks;
342  }
343
344  /**
345   * Updates stats from datanode heartbeat.
346   */
347  public void updateHeartbeat(StorageReport[] reports, long cacheCapacity,
348      long cacheUsed, int xceiverCount, int volFailures) {
349    long totalCapacity = 0;
350    long totalRemaining = 0;
351    long totalBlockPoolUsed = 0;
352    long totalDfsUsed = 0;
353
354    setCacheCapacity(cacheCapacity);
355    setCacheUsed(cacheUsed);
356    setXceiverCount(xceiverCount);
357    setLastUpdate(Time.now());    
358    this.volumeFailures = volFailures;
359    for (StorageReport report : reports) {
360      DatanodeStorageInfo storage = storageMap.get(report.getStorage().getStorageID());
361      if (storage == null) {
362        // This is seen during cluster initialization when the heartbeat
363        // is received before the initial block reports from each storage.
364        storage = updateStorage(report.getStorage());
365      }
366      storage.receivedHeartbeat(report);
367      totalCapacity += report.getCapacity();
368      totalRemaining += report.getRemaining();
369      totalBlockPoolUsed += report.getBlockPoolUsed();
370      totalDfsUsed += report.getDfsUsed();
371    }
372    rollBlocksScheduled(getLastUpdate());
373
374    // Update total metrics for the node.
375    setCapacity(totalCapacity);
376    setRemaining(totalRemaining);
377    setBlockPoolUsed(totalBlockPoolUsed);
378    setDfsUsed(totalDfsUsed);
379  }
380
381  private static class BlockIterator implements Iterator<BlockInfo> {
382    private int index = 0;
383    private final List<Iterator<BlockInfo>> iterators;
384    
385    private BlockIterator(final DatanodeStorageInfo... storages) {
386      List<Iterator<BlockInfo>> iterators = new ArrayList<Iterator<BlockInfo>>();
387      for (DatanodeStorageInfo e : storages) {
388        iterators.add(e.getBlockIterator());
389      }
390      this.iterators = Collections.unmodifiableList(iterators);
391    }
392
393    @Override
394    public boolean hasNext() {
395      update();
396      return !iterators.isEmpty() && iterators.get(index).hasNext();
397    }
398
399    @Override
400    public BlockInfo next() {
401      update();
402      return iterators.get(index).next();
403    }
404    
405    @Override
406    public void remove() {
407      throw new UnsupportedOperationException("Remove unsupported.");
408    }
409    
410    private void update() {
411      while(index < iterators.size() - 1 && !iterators.get(index).hasNext()) {
412        index++;
413      }
414    }
415  }
416
417  Iterator<BlockInfo> getBlockIterator() {
418    return new BlockIterator(getStorageInfos());
419  }
420  Iterator<BlockInfo> getBlockIterator(final String storageID) {
421    return new BlockIterator(getStorageInfo(storageID));
422  }
423
424  /**
425   * Store block replication work.
426   */
427  void addBlockToBeReplicated(Block block, DatanodeStorageInfo[] targets) {
428    assert(block != null && targets != null && targets.length > 0);
429    replicateBlocks.offer(new BlockTargetPair(block, targets));
430  }
431
432  /**
433   * Store block recovery work.
434   */
435  void addBlockToBeRecovered(BlockInfoUnderConstruction block) {
436    if(recoverBlocks.contains(block)) {
437      // this prevents adding the same block twice to the recovery queue
438      BlockManager.LOG.info(block + " is already in the recovery queue");
439      return;
440    }
441    recoverBlocks.offer(block);
442  }
443
444  /**
445   * Store block invalidation work.
446   */
447  void addBlocksToBeInvalidated(List<Block> blocklist) {
448    assert(blocklist != null && blocklist.size() > 0);
449    synchronized (invalidateBlocks) {
450      for(Block blk : blocklist) {
451        invalidateBlocks.add(blk);
452      }
453    }
454  }
455  
456  /**
457   * The number of work items that are pending to be replicated
458   */
459  int getNumberOfBlocksToBeReplicated() {
460    return replicateBlocks.size();
461  }
462
463  /**
464   * The number of block invalidation items that are pending to 
465   * be sent to the datanode
466   */
467  int getNumberOfBlocksToBeInvalidated() {
468    synchronized (invalidateBlocks) {
469      return invalidateBlocks.size();
470    }
471  }
472
473  public List<BlockTargetPair> getReplicationCommand(int maxTransfers) {
474    return replicateBlocks.poll(maxTransfers);
475  }
476
477  public BlockInfoUnderConstruction[] getLeaseRecoveryCommand(int maxTransfers) {
478    List<BlockInfoUnderConstruction> blocks = recoverBlocks.poll(maxTransfers);
479    if(blocks == null)
480      return null;
481    return blocks.toArray(new BlockInfoUnderConstruction[blocks.size()]);
482  }
483
484  /**
485   * Remove the specified number of blocks to be invalidated
486   */
487  public Block[] getInvalidateBlocks(int maxblocks) {
488    synchronized (invalidateBlocks) {
489      Block[] deleteList = invalidateBlocks.pollToArray(new Block[Math.min(
490          invalidateBlocks.size(), maxblocks)]);
491      return deleteList.length == 0 ? null : deleteList;
492    }
493  }
494
495  /**
496   * @return Approximate number of blocks currently scheduled to be written 
497   * to this datanode.
498   */
499  public int getBlocksScheduled() {
500    return currApproxBlocksScheduled + prevApproxBlocksScheduled;
501  }
502
503  /** Increment the number of blocks scheduled. */
504  void incrementBlocksScheduled() {
505    currApproxBlocksScheduled++;
506  }
507  
508  /** Decrement the number of blocks scheduled. */
509  void decrementBlocksScheduled() {
510    if (prevApproxBlocksScheduled > 0) {
511      prevApproxBlocksScheduled--;
512    } else if (currApproxBlocksScheduled > 0) {
513      currApproxBlocksScheduled--;
514    } 
515    // its ok if both counters are zero.
516  }
517  
518  /** Adjusts curr and prev number of blocks scheduled every few minutes. */
519  private void rollBlocksScheduled(long now) {
520    if (now - lastBlocksScheduledRollTime > BLOCKS_SCHEDULED_ROLL_INTERVAL) {
521      prevApproxBlocksScheduled = currApproxBlocksScheduled;
522      currApproxBlocksScheduled = 0;
523      lastBlocksScheduledRollTime = now;
524    }
525  }
526  
527  @Override
528  public int hashCode() {
529    // Super implementation is sufficient
530    return super.hashCode();
531  }
532  
533  @Override
534  public boolean equals(Object obj) {
535    // Sufficient to use super equality as datanodes are uniquely identified
536    // by DatanodeID
537    return (this == obj) || super.equals(obj);
538  }
539
540  /** Decommissioning status */
541  public class DecommissioningStatus {
542    private int underReplicatedBlocks;
543    private int decommissionOnlyReplicas;
544    private int underReplicatedInOpenFiles;
545    private long startTime;
546    
547    synchronized void set(int underRep,
548        int onlyRep, int underConstruction) {
549      if (isDecommissionInProgress() == false) {
550        return;
551      }
552      underReplicatedBlocks = underRep;
553      decommissionOnlyReplicas = onlyRep;
554      underReplicatedInOpenFiles = underConstruction;
555    }
556
557    /** @return the number of under-replicated blocks */
558    public synchronized int getUnderReplicatedBlocks() {
559      if (isDecommissionInProgress() == false) {
560        return 0;
561      }
562      return underReplicatedBlocks;
563    }
564    /** @return the number of decommission-only replicas */
565    public synchronized int getDecommissionOnlyReplicas() {
566      if (isDecommissionInProgress() == false) {
567        return 0;
568      }
569      return decommissionOnlyReplicas;
570    }
571    /** @return the number of under-replicated blocks in open files */
572    public synchronized int getUnderReplicatedInOpenFiles() {
573      if (isDecommissionInProgress() == false) {
574        return 0;
575      }
576      return underReplicatedInOpenFiles;
577    }
578    /** Set start time */
579    public synchronized void setStartTime(long time) {
580      startTime = time;
581    }
582    /** @return start time */
583    public synchronized long getStartTime() {
584      if (isDecommissionInProgress() == false) {
585        return 0;
586      }
587      return startTime;
588    }
589  }  // End of class DecommissioningStatus
590
591  /**
592   * Set the flag to indicate if this datanode is disallowed from communicating
593   * with the namenode.
594   */
595  public void setDisallowed(boolean flag) {
596    disallowed = flag;
597  }
598  /** Is the datanode disallowed from communicating with the namenode? */
599  public boolean isDisallowed() {
600    return disallowed;
601  }
602
603  /**
604   * @return number of failed volumes in the datanode.
605   */
606  public int getVolumeFailures() {
607    return volumeFailures;
608  }
609
610  /**
611   * @param nodeReg DatanodeID to update registration for.
612   */
613  @Override
614  public void updateRegInfo(DatanodeID nodeReg) {
615    super.updateRegInfo(nodeReg);
616    
617    // must re-process IBR after re-registration
618    for(DatanodeStorageInfo storage : getStorageInfos()) {
619      storage.setBlockReportCount(0);
620    }
621  }
622
623  /**
624   * @return balancer bandwidth in bytes per second for this datanode
625   */
626  public long getBalancerBandwidth() {
627    return this.bandwidth;
628  }
629
630  /**
631   * @param bandwidth balancer bandwidth in bytes per second for this datanode
632   */
633  public void setBalancerBandwidth(long bandwidth) {
634    this.bandwidth = bandwidth;
635  }
636
637  @Override
638  public String dumpDatanode() {
639    StringBuilder sb = new StringBuilder(super.dumpDatanode());
640    int repl = replicateBlocks.size();
641    if (repl > 0) {
642      sb.append(" ").append(repl).append(" blocks to be replicated;");
643    }
644    int inval = invalidateBlocks.size();
645    if (inval > 0) {
646      sb.append(" ").append(inval).append(" blocks to be invalidated;");      
647    }
648    int recover = recoverBlocks.size();
649    if (recover > 0) {
650      sb.append(" ").append(recover).append(" blocks to be recovered;");
651    }
652    return sb.toString();
653  }
654
655  DatanodeStorageInfo updateStorage(DatanodeStorage s) {
656    synchronized (storageMap) {
657      DatanodeStorageInfo storage = storageMap.get(s.getStorageID());
658      if (storage == null) {
659        LOG.info("Adding new storage ID " + s.getStorageID() +
660                 " for DN " + getXferAddr());
661        storage = new DatanodeStorageInfo(this, s);
662        storageMap.put(s.getStorageID(), storage);
663      }
664      return storage;
665    }
666  }
667
668  /**
669   * @return   The time at which we last sent caching directives to this 
670   *           DataNode, in monotonic milliseconds.
671   */
672  public long getLastCachingDirectiveSentTimeMs() {
673    return this.lastCachingDirectiveSentTimeMs;
674  }
675
676  /**
677   * @param time  The time at which we last sent caching directives to this 
678   *              DataNode, in monotonic milliseconds.
679   */
680  public void setLastCachingDirectiveSentTimeMs(long time) {
681    this.lastCachingDirectiveSentTimeMs = time;
682  }
683}
684