001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.blockmanagement; 019 020import java.util.ArrayList; 021import java.util.Collection; 022import java.util.HashMap; 023import java.util.List; 024import java.util.Map; 025import java.util.Set; 026 027import org.apache.hadoop.conf.Configuration; 028import org.apache.hadoop.hdfs.DFSUtil; 029import org.apache.hadoop.hdfs.StorageType; 030import org.apache.hadoop.hdfs.protocol.DatanodeInfo; 031import org.apache.hadoop.hdfs.server.namenode.FSClusterStats; 032import org.apache.hadoop.net.NetworkTopology; 033import org.apache.hadoop.net.NetworkTopologyWithNodeGroup; 034import org.apache.hadoop.net.Node; 035import org.apache.hadoop.net.NodeBase; 036 037/** The class is responsible for choosing the desired number of targets 038 * for placing block replicas on environment with node-group layer. 039 * The replica placement strategy is adjusted to: 040 * If the writer is on a datanode, the 1st replica is placed on the local 041 * node (or local node-group), otherwise a random datanode. 042 * The 2nd replica is placed on a datanode that is on a different rack with 1st 043 * replica node. 044 * The 3rd replica is placed on a datanode which is on a different node-group 045 * but the same rack as the second replica node. 046 */ 047public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault { 048 049 protected BlockPlacementPolicyWithNodeGroup(Configuration conf, FSClusterStats stats, 050 NetworkTopology clusterMap) { 051 initialize(conf, stats, clusterMap); 052 } 053 054 protected BlockPlacementPolicyWithNodeGroup() { 055 } 056 057 public void initialize(Configuration conf, FSClusterStats stats, 058 NetworkTopology clusterMap) { 059 super.initialize(conf, stats, clusterMap); 060 } 061 062 /** choose local node of localMachine as the target. 063 * if localMachine is not available, choose a node on the same nodegroup or 064 * rack instead. 065 * @return the chosen node 066 */ 067 @Override 068 protected DatanodeStorageInfo chooseLocalStorage(Node localMachine, 069 Set<Node> excludedNodes, long blocksize, int maxNodesPerRack, 070 List<DatanodeStorageInfo> results, boolean avoidStaleNodes, 071 StorageType storageType) throws NotEnoughReplicasException { 072 // if no local machine, randomly choose one node 073 if (localMachine == null) 074 return chooseRandom(NodeBase.ROOT, excludedNodes, 075 blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType); 076 077 // otherwise try local machine first 078 if (localMachine instanceof DatanodeDescriptor) { 079 DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine; 080 if (excludedNodes.add(localMachine)) { // was not in the excluded list 081 for(DatanodeStorageInfo localStorage : DFSUtil.shuffle( 082 localDataNode.getStorageInfos())) { 083 if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize, 084 maxNodesPerRack, false, results, avoidStaleNodes, storageType) >= 0) { 085 return localStorage; 086 } 087 } 088 } 089 } 090 091 // try a node on local node group 092 DatanodeStorageInfo chosenStorage = chooseLocalNodeGroup( 093 (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes, 094 blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType); 095 if (chosenStorage != null) { 096 return chosenStorage; 097 } 098 // try a node on local rack 099 return chooseLocalRack(localMachine, excludedNodes, 100 blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType); 101 } 102 103 /** @return the node of the second replica */ 104 private static DatanodeDescriptor secondNode(Node localMachine, 105 List<DatanodeStorageInfo> results) { 106 // find the second replica 107 for(DatanodeStorageInfo nextStorage : results) { 108 DatanodeDescriptor nextNode = nextStorage.getDatanodeDescriptor(); 109 if (nextNode != localMachine) { 110 return nextNode; 111 } 112 } 113 return null; 114 } 115 116 @Override 117 protected DatanodeStorageInfo chooseLocalRack(Node localMachine, 118 Set<Node> excludedNodes, long blocksize, int maxNodesPerRack, 119 List<DatanodeStorageInfo> results, boolean avoidStaleNodes, 120 StorageType storageType) throws NotEnoughReplicasException { 121 // no local machine, so choose a random machine 122 if (localMachine == null) { 123 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, 124 maxNodesPerRack, results, avoidStaleNodes, storageType); 125 } 126 127 // choose one from the local rack, but off-nodegroup 128 try { 129 final String scope = NetworkTopology.getFirstHalf(localMachine.getNetworkLocation()); 130 return chooseRandom(scope, excludedNodes, blocksize, maxNodesPerRack, 131 results, avoidStaleNodes, storageType); 132 } catch (NotEnoughReplicasException e1) { 133 // find the second replica 134 final DatanodeDescriptor newLocal = secondNode(localMachine, results); 135 if (newLocal != null) { 136 try { 137 return chooseRandom( 138 clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes, 139 blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType); 140 } catch(NotEnoughReplicasException e2) { 141 //otherwise randomly choose one from the network 142 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, 143 maxNodesPerRack, results, avoidStaleNodes, storageType); 144 } 145 } else { 146 //otherwise randomly choose one from the network 147 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, 148 maxNodesPerRack, results, avoidStaleNodes, storageType); 149 } 150 } 151 } 152 153 /** 154 * {@inheritDoc} 155 */ 156 @Override 157 protected void chooseRemoteRack(int numOfReplicas, 158 DatanodeDescriptor localMachine, Set<Node> excludedNodes, 159 long blocksize, int maxReplicasPerRack, List<DatanodeStorageInfo> results, 160 boolean avoidStaleNodes, StorageType storageType) 161 throws NotEnoughReplicasException { 162 int oldNumOfReplicas = results.size(); 163 164 final String rackLocation = NetworkTopology.getFirstHalf( 165 localMachine.getNetworkLocation()); 166 try { 167 // randomly choose from remote racks 168 chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize, 169 maxReplicasPerRack, results, avoidStaleNodes, storageType); 170 } catch (NotEnoughReplicasException e) { 171 // fall back to the local rack 172 chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas), 173 rackLocation, excludedNodes, blocksize, 174 maxReplicasPerRack, results, avoidStaleNodes, storageType); 175 } 176 } 177 178 /* choose one node from the nodegroup that <i>localMachine</i> is on. 179 * if no such node is available, choose one node from the nodegroup where 180 * a second replica is on. 181 * if still no such node is available, choose a random node in the cluster. 182 * @return the chosen node 183 */ 184 private DatanodeStorageInfo chooseLocalNodeGroup( 185 NetworkTopologyWithNodeGroup clusterMap, Node localMachine, 186 Set<Node> excludedNodes, long blocksize, int maxNodesPerRack, 187 List<DatanodeStorageInfo> results, boolean avoidStaleNodes, 188 StorageType storageType) throws NotEnoughReplicasException { 189 // no local machine, so choose a random machine 190 if (localMachine == null) { 191 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, 192 maxNodesPerRack, results, avoidStaleNodes, storageType); 193 } 194 195 // choose one from the local node group 196 try { 197 return chooseRandom( 198 clusterMap.getNodeGroup(localMachine.getNetworkLocation()), 199 excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, 200 storageType); 201 } catch (NotEnoughReplicasException e1) { 202 final DatanodeDescriptor newLocal = secondNode(localMachine, results); 203 if (newLocal != null) { 204 try { 205 return chooseRandom( 206 clusterMap.getNodeGroup(newLocal.getNetworkLocation()), 207 excludedNodes, blocksize, maxNodesPerRack, results, 208 avoidStaleNodes, storageType); 209 } catch(NotEnoughReplicasException e2) { 210 //otherwise randomly choose one from the network 211 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, 212 maxNodesPerRack, results, avoidStaleNodes, storageType); 213 } 214 } else { 215 //otherwise randomly choose one from the network 216 return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, 217 maxNodesPerRack, results, avoidStaleNodes, storageType); 218 } 219 } 220 } 221 222 @Override 223 protected String getRack(final DatanodeInfo cur) { 224 String nodeGroupString = cur.getNetworkLocation(); 225 return NetworkTopology.getFirstHalf(nodeGroupString); 226 } 227 228 /** 229 * Find other nodes in the same nodegroup of <i>localMachine</i> and add them 230 * into <i>excludeNodes</i> as replica should not be duplicated for nodes 231 * within the same nodegroup 232 * @return number of new excluded nodes 233 */ 234 @Override 235 protected int addToExcludedNodes(DatanodeDescriptor chosenNode, 236 Set<Node> excludedNodes) { 237 int countOfExcludedNodes = 0; 238 String nodeGroupScope = chosenNode.getNetworkLocation(); 239 List<Node> leafNodes = clusterMap.getLeaves(nodeGroupScope); 240 for (Node leafNode : leafNodes) { 241 if (excludedNodes.add(leafNode)) { 242 // not a existing node in excludedNodes 243 countOfExcludedNodes++; 244 } 245 } 246 return countOfExcludedNodes; 247 } 248 249 /** 250 * Pick up replica node set for deleting replica as over-replicated. 251 * First set contains replica nodes on rack with more than one 252 * replica while second set contains remaining replica nodes. 253 * If first is not empty, divide first set into two subsets: 254 * moreThanOne contains nodes on nodegroup with more than one replica 255 * exactlyOne contains the remaining nodes in first set 256 * then pickup priSet if not empty. 257 * If first is empty, then pick second. 258 */ 259 @Override 260 public Collection<DatanodeDescriptor> pickupReplicaSet( 261 Collection<DatanodeDescriptor> first, 262 Collection<DatanodeDescriptor> second) { 263 // If no replica within same rack, return directly. 264 if (first.isEmpty()) { 265 return second; 266 } 267 // Split data nodes in the first set into two sets, 268 // moreThanOne contains nodes on nodegroup with more than one replica 269 // exactlyOne contains the remaining nodes 270 Map<String, List<DatanodeDescriptor>> nodeGroupMap = 271 new HashMap<String, List<DatanodeDescriptor>>(); 272 273 for(DatanodeDescriptor node : first) { 274 final String nodeGroupName = 275 NetworkTopology.getLastHalf(node.getNetworkLocation()); 276 List<DatanodeDescriptor> datanodeList = 277 nodeGroupMap.get(nodeGroupName); 278 if (datanodeList == null) { 279 datanodeList = new ArrayList<DatanodeDescriptor>(); 280 nodeGroupMap.put(nodeGroupName, datanodeList); 281 } 282 datanodeList.add(node); 283 } 284 285 final List<DatanodeDescriptor> moreThanOne = new ArrayList<DatanodeDescriptor>(); 286 final List<DatanodeDescriptor> exactlyOne = new ArrayList<DatanodeDescriptor>(); 287 // split nodes into two sets 288 for(List<DatanodeDescriptor> datanodeList : nodeGroupMap.values()) { 289 if (datanodeList.size() == 1 ) { 290 // exactlyOne contains nodes on nodegroup with exactly one replica 291 exactlyOne.add(datanodeList.get(0)); 292 } else { 293 // moreThanOne contains nodes on nodegroup with more than one replica 294 moreThanOne.addAll(datanodeList); 295 } 296 } 297 298 return moreThanOne.isEmpty()? exactlyOne : moreThanOne; 299 } 300 301}