001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.blockmanagement;
019
020import java.util.ArrayList;
021import java.util.Collection;
022import java.util.HashMap;
023import java.util.List;
024import java.util.Map;
025import java.util.Set;
026
027import org.apache.hadoop.conf.Configuration;
028import org.apache.hadoop.hdfs.DFSUtil;
029import org.apache.hadoop.hdfs.StorageType;
030import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
031import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
032import org.apache.hadoop.net.NetworkTopology;
033import org.apache.hadoop.net.NetworkTopologyWithNodeGroup;
034import org.apache.hadoop.net.Node;
035import org.apache.hadoop.net.NodeBase;
036
037/** The class is responsible for choosing the desired number of targets
038 * for placing block replicas on environment with node-group layer.
039 * The replica placement strategy is adjusted to:
040 * If the writer is on a datanode, the 1st replica is placed on the local 
041 *     node (or local node-group), otherwise a random datanode. 
042 * The 2nd replica is placed on a datanode that is on a different rack with 1st
043 *     replica node. 
044 * The 3rd replica is placed on a datanode which is on a different node-group
045 *     but the same rack as the second replica node.
046 */
047public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault {
048
049  protected BlockPlacementPolicyWithNodeGroup(Configuration conf,  FSClusterStats stats,
050      NetworkTopology clusterMap) {
051    initialize(conf, stats, clusterMap);
052  }
053
054  protected BlockPlacementPolicyWithNodeGroup() {
055  }
056
057  public void initialize(Configuration conf,  FSClusterStats stats,
058          NetworkTopology clusterMap) {
059    super.initialize(conf, stats, clusterMap);
060  }
061
062  /** choose local node of localMachine as the target.
063   * if localMachine is not available, choose a node on the same nodegroup or 
064   * rack instead.
065   * @return the chosen node
066   */
067  @Override
068  protected DatanodeStorageInfo chooseLocalStorage(Node localMachine,
069      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
070      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
071      StorageType storageType) throws NotEnoughReplicasException {
072    // if no local machine, randomly choose one node
073    if (localMachine == null)
074      return chooseRandom(NodeBase.ROOT, excludedNodes, 
075          blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
076
077    // otherwise try local machine first
078    if (localMachine instanceof DatanodeDescriptor) {
079      DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine;
080      if (excludedNodes.add(localMachine)) { // was not in the excluded list
081        for(DatanodeStorageInfo localStorage : DFSUtil.shuffle(
082            localDataNode.getStorageInfos())) {
083          if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
084              maxNodesPerRack, false, results, avoidStaleNodes, storageType) >= 0) {
085            return localStorage;
086          }
087        }
088      }
089    }
090
091    // try a node on local node group
092    DatanodeStorageInfo chosenStorage = chooseLocalNodeGroup(
093        (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes, 
094        blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
095    if (chosenStorage != null) {
096      return chosenStorage;
097    }
098    // try a node on local rack
099    return chooseLocalRack(localMachine, excludedNodes, 
100        blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
101  }
102
103  /** @return the node of the second replica */
104  private static DatanodeDescriptor secondNode(Node localMachine,
105      List<DatanodeStorageInfo> results) {
106    // find the second replica
107    for(DatanodeStorageInfo nextStorage : results) {
108      DatanodeDescriptor nextNode = nextStorage.getDatanodeDescriptor();
109      if (nextNode != localMachine) {
110        return nextNode;
111      }
112    }
113    return null;
114  }
115
116  @Override
117  protected DatanodeStorageInfo chooseLocalRack(Node localMachine,
118      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
119      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
120      StorageType storageType) throws NotEnoughReplicasException {
121    // no local machine, so choose a random machine
122    if (localMachine == null) {
123      return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
124          maxNodesPerRack, results, avoidStaleNodes, storageType);
125    }
126
127    // choose one from the local rack, but off-nodegroup
128    try {
129      final String scope = NetworkTopology.getFirstHalf(localMachine.getNetworkLocation());
130      return chooseRandom(scope, excludedNodes, blocksize, maxNodesPerRack,
131          results, avoidStaleNodes, storageType);
132    } catch (NotEnoughReplicasException e1) {
133      // find the second replica
134      final DatanodeDescriptor newLocal = secondNode(localMachine, results);
135      if (newLocal != null) {
136        try {
137          return chooseRandom(
138              clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes,
139              blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
140        } catch(NotEnoughReplicasException e2) {
141          //otherwise randomly choose one from the network
142          return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
143              maxNodesPerRack, results, avoidStaleNodes, storageType);
144        }
145      } else {
146        //otherwise randomly choose one from the network
147        return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
148            maxNodesPerRack, results, avoidStaleNodes, storageType);
149      }
150    }
151  }
152
153  /**
154   * {@inheritDoc}
155   */
156  @Override
157  protected void chooseRemoteRack(int numOfReplicas,
158      DatanodeDescriptor localMachine, Set<Node> excludedNodes,
159      long blocksize, int maxReplicasPerRack, List<DatanodeStorageInfo> results,
160      boolean avoidStaleNodes, StorageType storageType)
161          throws NotEnoughReplicasException {
162    int oldNumOfReplicas = results.size();
163
164    final String rackLocation = NetworkTopology.getFirstHalf(
165        localMachine.getNetworkLocation());
166    try {
167      // randomly choose from remote racks
168      chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize,
169          maxReplicasPerRack, results, avoidStaleNodes, storageType);
170    } catch (NotEnoughReplicasException e) {
171      // fall back to the local rack
172      chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
173          rackLocation, excludedNodes, blocksize,
174          maxReplicasPerRack, results, avoidStaleNodes, storageType);
175    }
176  }
177
178  /* choose one node from the nodegroup that <i>localMachine</i> is on.
179   * if no such node is available, choose one node from the nodegroup where
180   * a second replica is on.
181   * if still no such node is available, choose a random node in the cluster.
182   * @return the chosen node
183   */
184  private DatanodeStorageInfo chooseLocalNodeGroup(
185      NetworkTopologyWithNodeGroup clusterMap, Node localMachine,
186      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
187      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
188      StorageType storageType) throws NotEnoughReplicasException {
189    // no local machine, so choose a random machine
190    if (localMachine == null) {
191      return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
192          maxNodesPerRack, results, avoidStaleNodes, storageType);
193    }
194
195    // choose one from the local node group
196    try {
197      return chooseRandom(
198          clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
199          excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes,
200          storageType);
201    } catch (NotEnoughReplicasException e1) {
202      final DatanodeDescriptor newLocal = secondNode(localMachine, results);
203      if (newLocal != null) {
204        try {
205          return chooseRandom(
206              clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
207              excludedNodes, blocksize, maxNodesPerRack, results,
208              avoidStaleNodes, storageType);
209        } catch(NotEnoughReplicasException e2) {
210          //otherwise randomly choose one from the network
211          return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
212              maxNodesPerRack, results, avoidStaleNodes, storageType);
213        }
214      } else {
215        //otherwise randomly choose one from the network
216        return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
217            maxNodesPerRack, results, avoidStaleNodes, storageType);
218      }
219    }
220  }
221
222  @Override
223  protected String getRack(final DatanodeInfo cur) {
224    String nodeGroupString = cur.getNetworkLocation();
225    return NetworkTopology.getFirstHalf(nodeGroupString);
226  }
227  
228  /**
229   * Find other nodes in the same nodegroup of <i>localMachine</i> and add them
230   * into <i>excludeNodes</i> as replica should not be duplicated for nodes 
231   * within the same nodegroup
232   * @return number of new excluded nodes
233   */
234  @Override
235  protected int addToExcludedNodes(DatanodeDescriptor chosenNode,
236      Set<Node> excludedNodes) {
237    int countOfExcludedNodes = 0;
238    String nodeGroupScope = chosenNode.getNetworkLocation();
239    List<Node> leafNodes = clusterMap.getLeaves(nodeGroupScope);
240    for (Node leafNode : leafNodes) {
241      if (excludedNodes.add(leafNode)) {
242        // not a existing node in excludedNodes
243        countOfExcludedNodes++;
244      }
245    }
246    return countOfExcludedNodes;
247  }
248
249  /**
250   * Pick up replica node set for deleting replica as over-replicated. 
251   * First set contains replica nodes on rack with more than one
252   * replica while second set contains remaining replica nodes.
253   * If first is not empty, divide first set into two subsets:
254   *   moreThanOne contains nodes on nodegroup with more than one replica
255   *   exactlyOne contains the remaining nodes in first set
256   * then pickup priSet if not empty.
257   * If first is empty, then pick second.
258   */
259  @Override
260  public Collection<DatanodeDescriptor> pickupReplicaSet(
261      Collection<DatanodeDescriptor> first,
262      Collection<DatanodeDescriptor> second) {
263    // If no replica within same rack, return directly.
264    if (first.isEmpty()) {
265      return second;
266    }
267    // Split data nodes in the first set into two sets, 
268    // moreThanOne contains nodes on nodegroup with more than one replica
269    // exactlyOne contains the remaining nodes
270    Map<String, List<DatanodeDescriptor>> nodeGroupMap = 
271        new HashMap<String, List<DatanodeDescriptor>>();
272    
273    for(DatanodeDescriptor node : first) {
274      final String nodeGroupName = 
275          NetworkTopology.getLastHalf(node.getNetworkLocation());
276      List<DatanodeDescriptor> datanodeList = 
277          nodeGroupMap.get(nodeGroupName);
278      if (datanodeList == null) {
279        datanodeList = new ArrayList<DatanodeDescriptor>();
280        nodeGroupMap.put(nodeGroupName, datanodeList);
281      }
282      datanodeList.add(node);
283    }
284    
285    final List<DatanodeDescriptor> moreThanOne = new ArrayList<DatanodeDescriptor>();
286    final List<DatanodeDescriptor> exactlyOne = new ArrayList<DatanodeDescriptor>();
287    // split nodes into two sets
288    for(List<DatanodeDescriptor> datanodeList : nodeGroupMap.values()) {
289      if (datanodeList.size() == 1 ) {
290        // exactlyOne contains nodes on nodegroup with exactly one replica
291        exactlyOne.add(datanodeList.get(0));
292      } else {
293        // moreThanOne contains nodes on nodegroup with more than one replica
294        moreThanOne.addAll(datanodeList);
295      }
296    }
297    
298    return moreThanOne.isEmpty()? exactlyOne : moreThanOne;
299  }
300  
301}