001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.datanode.fsdataset;
019
020import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT;
021import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY;
022import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT;
023import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY;
024
025import java.io.IOException;
026import java.util.ArrayList;
027import java.util.List;
028import java.util.Random;
029
030import org.apache.commons.logging.Log;
031import org.apache.commons.logging.LogFactory;
032import org.apache.hadoop.conf.Configurable;
033import org.apache.hadoop.conf.Configuration;
034import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
035
036/**
037 * A DN volume choosing policy which takes into account the amount of free
038 * space on each of the available volumes when considering where to assign a
039 * new replica allocation. By default this policy prefers assigning replicas to
040 * those volumes with more available free space, so as to over time balance the
041 * available space of all the volumes within a DN.
042 */
043public class AvailableSpaceVolumeChoosingPolicy<V extends FsVolumeSpi>
044    implements VolumeChoosingPolicy<V>, Configurable {
045  
046  private static final Log LOG = LogFactory.getLog(AvailableSpaceVolumeChoosingPolicy.class);
047  
048  private static final Random RAND = new Random();
049  
050  private long balancedSpaceThreshold = DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT;
051  private float balancedPreferencePercent = DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT;
052
053  @Override
054  public synchronized void setConf(Configuration conf) {
055    balancedSpaceThreshold = conf.getLong(
056        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY,
057        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT);
058    balancedPreferencePercent = conf.getFloat(
059        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY,
060        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT);
061    
062    LOG.info("Available space volume choosing policy initialized: " +
063        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY +
064        " = " + balancedSpaceThreshold + ", " +
065        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
066        " = " + balancedPreferencePercent);
067
068    if (balancedPreferencePercent > 1.0) {
069      LOG.warn("The value of " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
070               " is greater than 1.0 but should be in the range 0.0 - 1.0");
071    }
072
073    if (balancedPreferencePercent < 0.5) {
074      LOG.warn("The value of " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
075               " is less than 0.5 so volumes with less available disk space will receive more block allocations");
076    }
077  }
078  
079  @Override
080  public synchronized Configuration getConf() {
081    // Nothing to do. Only added to fulfill the Configurable contract.
082    return null;
083  }
084  
085  private VolumeChoosingPolicy<V> roundRobinPolicyBalanced =
086      new RoundRobinVolumeChoosingPolicy<V>();
087  private VolumeChoosingPolicy<V> roundRobinPolicyHighAvailable =
088      new RoundRobinVolumeChoosingPolicy<V>();
089  private VolumeChoosingPolicy<V> roundRobinPolicyLowAvailable =
090      new RoundRobinVolumeChoosingPolicy<V>();
091
092  @Override
093  public synchronized V chooseVolume(List<V> volumes,
094      final long replicaSize) throws IOException {
095    if (volumes.size() < 1) {
096      throw new DiskOutOfSpaceException("No more available volumes");
097    }
098    
099    AvailableSpaceVolumeList volumesWithSpaces =
100        new AvailableSpaceVolumeList(volumes);
101    
102    if (volumesWithSpaces.areAllVolumesWithinFreeSpaceThreshold()) {
103      // If they're actually not too far out of whack, fall back on pure round
104      // robin.
105      V volume = roundRobinPolicyBalanced.chooseVolume(volumes, replicaSize);
106      if (LOG.isDebugEnabled()) {
107        LOG.debug("All volumes are within the configured free space balance " +
108            "threshold. Selecting " + volume + " for write of block size " +
109            replicaSize);
110      }
111      return volume;
112    } else {
113      V volume = null;
114      // If none of the volumes with low free space have enough space for the
115      // replica, always try to choose a volume with a lot of free space.
116      long mostAvailableAmongLowVolumes = volumesWithSpaces
117          .getMostAvailableSpaceAmongVolumesWithLowAvailableSpace();
118      
119      List<V> highAvailableVolumes = extractVolumesFromPairs(
120          volumesWithSpaces.getVolumesWithHighAvailableSpace());
121      List<V> lowAvailableVolumes = extractVolumesFromPairs(
122          volumesWithSpaces.getVolumesWithLowAvailableSpace());
123      
124      float preferencePercentScaler =
125          (highAvailableVolumes.size() * balancedPreferencePercent) +
126          (lowAvailableVolumes.size() * (1 - balancedPreferencePercent));
127      float scaledPreferencePercent =
128          (highAvailableVolumes.size() * balancedPreferencePercent) /
129          preferencePercentScaler;
130      if (mostAvailableAmongLowVolumes < replicaSize ||
131          RAND.nextFloat() < scaledPreferencePercent) {
132        volume = roundRobinPolicyHighAvailable.chooseVolume(
133            highAvailableVolumes,
134            replicaSize);
135        if (LOG.isDebugEnabled()) {
136          LOG.debug("Volumes are imbalanced. Selecting " + volume +
137              " from high available space volumes for write of block size "
138              + replicaSize);
139        }
140      } else {
141        volume = roundRobinPolicyLowAvailable.chooseVolume(
142            lowAvailableVolumes,
143            replicaSize);
144        if (LOG.isDebugEnabled()) {
145          LOG.debug("Volumes are imbalanced. Selecting " + volume +
146              " from low available space volumes for write of block size "
147              + replicaSize);
148        }
149      }
150      return volume;
151    }
152  }
153  
154  /**
155   * Used to keep track of the list of volumes we're choosing from.
156   */
157  private class AvailableSpaceVolumeList {
158    private final List<AvailableSpaceVolumePair> volumes;
159    
160    public AvailableSpaceVolumeList(List<V> volumes) throws IOException {
161      this.volumes = new ArrayList<AvailableSpaceVolumePair>();
162      for (V volume : volumes) {
163        this.volumes.add(new AvailableSpaceVolumePair(volume));
164      }
165    }
166    
167    /**
168     * Check if the available space on all the volumes is roughly equal.
169     * 
170     * @param volumes the volumes to check
171     * @return true if all volumes' free space is within the configured threshold,
172     *         false otherwise.
173     * @throws IOException
174     *           in the event of error checking amount of available space
175     */
176    public boolean areAllVolumesWithinFreeSpaceThreshold() {
177      long leastAvailable = Long.MAX_VALUE;
178      long mostAvailable = 0;
179      for (AvailableSpaceVolumePair volume : volumes) {
180        leastAvailable = Math.min(leastAvailable, volume.getAvailable());
181        mostAvailable = Math.max(mostAvailable, volume.getAvailable());
182      }
183      return (mostAvailable - leastAvailable) < balancedSpaceThreshold;
184    }
185    
186    /**
187     * @return the minimum amount of space available on a single volume,
188     *         across all volumes.
189     */
190    private long getLeastAvailableSpace() {
191      long leastAvailable = Long.MAX_VALUE;
192      for (AvailableSpaceVolumePair volume : volumes) {
193        leastAvailable = Math.min(leastAvailable, volume.getAvailable());
194      }
195      return leastAvailable;
196    }
197    
198    /**
199     * @return the maximum amount of space available across volumes with low space.
200     */
201    public long getMostAvailableSpaceAmongVolumesWithLowAvailableSpace() {
202      long mostAvailable = Long.MIN_VALUE;
203      for (AvailableSpaceVolumePair volume : getVolumesWithLowAvailableSpace()) {
204        mostAvailable = Math.max(mostAvailable, volume.getAvailable());
205      }
206      return mostAvailable;
207    }
208    
209    /**
210     * @return the list of volumes with relatively low available space.
211     */
212    public List<AvailableSpaceVolumePair> getVolumesWithLowAvailableSpace() {
213      long leastAvailable = getLeastAvailableSpace();
214      List<AvailableSpaceVolumePair> ret = new ArrayList<AvailableSpaceVolumePair>();
215      for (AvailableSpaceVolumePair volume : volumes) {
216        if (volume.getAvailable() <= leastAvailable + balancedSpaceThreshold) {
217          ret.add(volume);
218        }
219      }
220      return ret;
221    }
222    
223    /**
224     * @return the list of volumes with a lot of available space.
225     */
226    public List<AvailableSpaceVolumePair> getVolumesWithHighAvailableSpace() {
227      long leastAvailable = getLeastAvailableSpace();
228      List<AvailableSpaceVolumePair> ret = new ArrayList<AvailableSpaceVolumePair>();
229      for (AvailableSpaceVolumePair volume : volumes) {
230        if (volume.getAvailable() > leastAvailable + balancedSpaceThreshold) {
231          ret.add(volume);
232        }
233      }
234      return ret;
235    }
236    
237  }
238  
239  /**
240   * Used so that we only check the available space on a given volume once, at
241   * the beginning of {@link AvailableSpaceVolumeChoosingPolicy#chooseVolume(List, long)}.
242   */
243  private class AvailableSpaceVolumePair {
244    private final V volume;
245    private final long availableSpace;
246    
247    public AvailableSpaceVolumePair(V volume) throws IOException {
248      this.volume = volume;
249      this.availableSpace = volume.getAvailable();
250    }
251    
252    public long getAvailable() {
253      return availableSpace;
254    }
255    
256    public V getVolume() {
257      return volume;
258    }
259  }
260  
261  private List<V> extractVolumesFromPairs(List<AvailableSpaceVolumePair> volumes) {
262    List<V> ret = new ArrayList<V>();
263    for (AvailableSpaceVolumePair volume : volumes) {
264      ret.add(volume.getVolume());
265    }
266    return ret;
267  }
268
269}