/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_KEY;
import static org.apache.hadoop.util.Time.now;

import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.lang.management.ManagementFactory;
import java.net.InetAddress;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import javax.management.NotCompliantMBeanException;
import javax.management.ObjectName;
import javax.management.StandardMBean;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BatchedRemoteIterator.BatchedListEntries;
import org.apache.hadoop.fs.CacheFlag;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.CreateFlag;
import org.apache.hadoop.fs.DirectoryListingStartAfterNotFoundException;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsServerDefaults;
import org.apache.hadoop.fs.InvalidPathException;
import org.apache.hadoop.fs.Options;
import org.apache.hadoop.fs.Options.Rename;
import org.apache.hadoop.fs.ParentNotDirectoryException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.ha.ServiceFailedException;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.DirectoryListing;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.CachePoolEntry;
import org.apache.hadoop.hdfs.protocol.CachePoolInfo;
import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport.DiffReportEntry;
import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
import org.apache.hadoop.hdfs.server.blockmanagement.*;
import org.apache.hadoop.hdfs.server.common.GenerationStamp;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.common.Util;
import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream;
import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer;
import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
import org.apache.hadoop.hdfs.server.namenode.ha.HAState;
import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer;
import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable;
import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable.SnapshotDiffInfo;
import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileWithSnapshot;
import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Status;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.hdfs.util.ChunkedArrayList;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.ipc.RetriableException;
import org.apache.hadoop.ipc.RetryCache;
import org.apache.hadoop.ipc.RetryCache.CacheEntry;
import org.apache.hadoop.ipc.RetryCache.CacheEntryWithPayload;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.ipc.StandbyException;
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.util.MBeans;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.TokenIdentifier;
import org.apache.hadoop.security.token.delegation.DelegationKey;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.VersionInfo;
import org.apache.log4j.Appender;
import org.apache.log4j.AsyncAppender;
import org.apache.log4j.Logger;
import org.mortbay.util.ajax.JSON;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;

/***************************************************
 * FSNamesystem does the actual bookkeeping work for the
 * NameNode.
 *
 * It tracks several important tables.
 *
 * 1)  valid fsname --> blocklist  (kept on disk, logged)
 * 2)  Set of all valid blocks (inverted #1)
 * 3)  block --> machinelist (kept in memory, rebuilt dynamically from reports)
 * 4)  machine --> blocklist (inverted #3)
 * 5)  LRU cache of updated-heartbeat machines
 ***************************************************/
@InterfaceAudience.Private
@Metrics(context="dfs")
public class FSNamesystem implements Namesystem, FSClusterStats,
    FSNamesystemMBean, NameNodeMXBean {
  public static final Log LOG = LogFactory.getLog(FSNamesystem.class);

  private static final ThreadLocal<StringBuilder> auditBuffer =
    new ThreadLocal<StringBuilder>() {
      @Override
      protected StringBuilder initialValue() {
        return new StringBuilder();
      }
  };

  @VisibleForTesting
  public boolean isAuditEnabled() {
    return !isDefaultAuditLogger || auditLog.isInfoEnabled();
  }

  private HdfsFileStatus getAuditFileInfo(String path, boolean resolveSymlink)
      throws IOException {
    return (isAuditEnabled() && isExternalInvocation())
        ? dir.getFileInfo(path, resolveSymlink) : null;
  }

  private void logAuditEvent(boolean succeeded, String cmd, String src)
      throws IOException {
    logAuditEvent(succeeded, cmd, src, null, null);
  }

  private void logAuditEvent(boolean succeeded, String cmd, String src,
      String dst, HdfsFileStatus stat) throws IOException {
    if (isAuditEnabled() && isExternalInvocation()) {
      logAuditEvent(succeeded, getRemoteUser(), getRemoteIp(),
                    cmd, src, dst, stat);
    }
  }

  private void logAuditEvent(boolean succeeded,
      UserGroupInformation ugi, InetAddress addr, String cmd, String src,
      String dst, HdfsFileStatus stat) {
    FileStatus status = null;
    if (stat != null) {
      Path symlink = stat.isSymlink() ? new Path(stat.getSymlink()) : null;
      Path path = dst != null ? new Path(dst) : new Path(src);
      status = new FileStatus(stat.getLen(), stat.isDir(),
          stat.getReplication(), stat.getBlockSize(), stat.getModificationTime(),
          stat.getAccessTime(), stat.getPermission(), stat.getOwner(),
          stat.getGroup(), symlink, path);
    }
    for (AuditLogger logger : auditLoggers) {
      if (logger instanceof HdfsAuditLogger) {
        HdfsAuditLogger hdfsLogger = (HdfsAuditLogger) logger;
        hdfsLogger.logAuditEvent(succeeded, ugi.toString(), addr, cmd, src, dst,
            status, ugi, dtSecretManager);
      } else {
        logger.logAuditEvent(succeeded, ugi.toString(), addr,
            cmd, src, dst, status);
      }
    }
  }

  /**
   * Logger for audit events, recording FSNamesystem operations and whether
   * they succeeded. Emits to FSNamesystem.audit at INFO. Each event causes a
   * set of tab-separated <code>key=value</code> pairs to be written for the
   * following properties:
   * <code>
   * ugi=&lt;ugi in RPC&gt;
   * ip=&lt;remote IP&gt;
   * cmd=&lt;command&gt;
   * src=&lt;src path&gt;
   * dst=&lt;dst path (optional)&gt;
   * perm=&lt;permissions (optional)&gt;
   * </code>
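   * <p>
   * As an illustrative sketch only (hypothetical values; the fields are
   * tab-separated in the actual output), a single audit line might look like:
   * <pre>
   * ugi=alice  ip=/10.0.0.1  cmd=mkdirs  src=/tmp/dir  dst=null  perm=alice:supergroup:rwxr-xr-x
   * </pre>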
   */
  public static final Log auditLog = LogFactory.getLog(
      FSNamesystem.class.getName() + ".audit");

  static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100;
  static int BLOCK_DELETION_INCREMENT = 1000;
  private final boolean isPermissionEnabled;
  private final UserGroupInformation fsOwner;
  private final String fsOwnerShortUserName;
  private final String supergroup;
  private final boolean standbyShouldCheckpoint;

  // Scan interval is not configurable.
  private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
    TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
  final DelegationTokenSecretManager dtSecretManager;
  private final boolean alwaysUseDelegationTokensForTests;

  private static final Step STEP_AWAITING_REPORTED_BLOCKS =
    new Step(StepType.AWAITING_REPORTED_BLOCKS);

  // Tracks whether the default audit logger is the only configured audit
  // logger; this allows isAuditEnabled() to return false in case the
  // underlying logger is disabled, and avoid some unnecessary work.
  private final boolean isDefaultAuditLogger;
  private final List<AuditLogger> auditLoggers;

  /** The namespace tree. */
  FSDirectory dir;
  private final BlockManager blockManager;
  private final SnapshotManager snapshotManager;
  private final CacheManager cacheManager;
  private final DatanodeStatistics datanodeStatistics;

  // Block pool ID used by this namenode
  private String blockPoolId;

  final LeaseManager leaseManager = new LeaseManager(this);

  volatile Daemon smmthread = null;  // SafeModeMonitor thread

  Daemon nnrmthread = null; // NamenodeResourceMonitor thread

  Daemon nnEditLogRoller = null; // NameNodeEditLogRoller thread
  /**
   * Threshold, in number of edits, at which an active namenode will roll its
   * own edit log.
   */
  private final long editLogRollerThreshold;
  /**
   * Interval, in milliseconds, at which an active namenode's edit log roller
   * thread checks whether to roll.
   */
  private final int editLogRollerInterval;

  private volatile boolean hasResourcesAvailable = false;
  private volatile boolean fsRunning = true;

  /** The start time of the namesystem. */
  private final long startTime = now();

  /** Interval at which the namenode checks for disk space availability. */
  private final long resourceRecheckInterval;

  // The actual resource checker instance.
  NameNodeResourceChecker nnResourceChecker;

  private final FsServerDefaults serverDefaults;
  private final boolean supportAppends;
  private final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure;

  private volatile SafeModeInfo safeMode;  // safe mode information

  private final long maxFsObjects;          // maximum number of fs objects

  private final long minBlockSize;         // minimum block size
  private final long maxBlocksPerFile;     // maximum # of blocks per file

  /**
   * The global generation stamp for legacy blocks with randomly
   * generated block IDs.
   */
  private final GenerationStamp generationStampV1 = new GenerationStamp();

  /**
   * The global generation stamp for this file system.
   */
  private final GenerationStamp generationStampV2 = new GenerationStamp();

  /**
   * The value of the generation stamp when the first switch to sequential
   * block IDs was made. Blocks with generation stamps below this value
   * have randomly allocated block IDs. Blocks with generation stamps above
   * this value have sequentially allocated block IDs. Read from the fsImage
   * (or initialized as an offset from the V1 (legacy) generation stamp on
   * upgrade).
   */
  private long generationStampV1Limit =
      GenerationStamp.GRANDFATHER_GENERATION_STAMP;

  /**
   * The global block ID space for this file system.
   */
  @VisibleForTesting
  private final SequentialBlockIdGenerator blockIdGenerator;

  // precision of access times.
  private final long accessTimePrecision;

  /** Lock to protect FSNamesystem. */
  private FSNamesystemLock fsLock;

  /**
   * Used when this NN is in standby state to read from the shared edit log.
   */
  private EditLogTailer editLogTailer = null;

  /**
   * Used when this NN is in standby state to perform checkpoints.
   */
  private StandbyCheckpointer standbyCheckpointer;

  /**
   * Reference to the NN's HAContext object. This is only set once
   * {@link #startCommonServices(Configuration, HAContext)} is called.
   */
  private HAContext haContext;

  private final boolean haEnabled;

  /**
   * Whether the namenode is in the middle of starting the active service
   */
  private volatile boolean startingActiveService = false;

  private INodeId inodeId;

  private final RetryCache retryCache;

  /**
   * Set the last allocated inode id when fsimage or editlog is loaded.
   */
  public void resetLastInodeId(long newValue) throws IOException {
    try {
      inodeId.skipTo(newValue);
    } catch(IllegalStateException ise) {
      throw new IOException(ise);
    }
  }

  /** Should only be used for tests to reset to any value */
  void resetLastInodeIdWithoutChecking(long newValue) {
    inodeId.setCurrentValue(newValue);
  }

  /** @return the last inode ID. */
  public long getLastInodeId() {
    return inodeId.getCurrentValue();
  }

  /** Allocate a new inode ID. */
  public long allocateNewInodeId() {
    return inodeId.nextValue();
  }

  /**
   * Clear all loaded data
   */
  void clear() {
    dir.reset();
    dtSecretManager.reset();
    generationStampV1.setCurrentValue(GenerationStamp.LAST_RESERVED_STAMP);
    generationStampV2.setCurrentValue(GenerationStamp.LAST_RESERVED_STAMP);
    blockIdGenerator.setCurrentValue(
        SequentialBlockIdGenerator.LAST_RESERVED_BLOCK_ID);
    generationStampV1Limit = GenerationStamp.GRANDFATHER_GENERATION_STAMP;
    leaseManager.removeAllLeases();
    inodeId.setCurrentValue(INodeId.LAST_RESERVED_ID);
    snapshotManager.clearSnapshottableDirs();
    cacheManager.clear();
  }

  @VisibleForTesting
  LeaseManager getLeaseManager() {
    return leaseManager;
  }

  /**
   * Check the supplied configuration for correctness.
   * @param conf Supplies the configuration to validate.
   * @throws IOException if the configuration could not be queried.
   * @throws IllegalArgumentException if the configuration is invalid.
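   * <p>
   * A minimal sketch of a configuration that fails validation (hypothetical
   * values): the required directory below is listed in neither
   * {@code dfs.namenode.edits.dir} nor {@code dfs.namenode.shared.edits.dir},
   * so this method throws {@link IllegalArgumentException}:
   * <pre>
   *   dfs.namenode.edits.dir          = file:///data/1/edits
   *   dfs.namenode.edits.dir.required = file:///data/2/edits
   * </pre>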
   */
  private static void checkConfiguration(Configuration conf)
      throws IOException {

    final Collection<URI> namespaceDirs =
        FSNamesystem.getNamespaceDirs(conf);
    final Collection<URI> editsDirs =
        FSNamesystem.getNamespaceEditsDirs(conf);
    final Collection<URI> requiredEditsDirs =
        FSNamesystem.getRequiredNamespaceEditsDirs(conf);
    final Collection<URI> sharedEditsDirs =
        FSNamesystem.getSharedEditsDirs(conf);

    for (URI u : requiredEditsDirs) {
      if (u.toString().compareTo(
              DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT) == 0) {
        continue;
      }

      // Each required directory must also be in editsDirs or in
      // sharedEditsDirs.
      if (!editsDirs.contains(u) &&
          !sharedEditsDirs.contains(u)) {
        throw new IllegalArgumentException(
            "Required edits directory " + u.toString() + " not present in " +
            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + ". " +
            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + "=" +
            editsDirs.toString() + "; " +
            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY + "=" +
            requiredEditsDirs.toString() + ". " +
            DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY + "=" +
            sharedEditsDirs.toString() + ".");
      }
    }

    if (namespaceDirs.size() == 1) {
      LOG.warn("Only one image storage directory ("
          + DFS_NAMENODE_NAME_DIR_KEY + ") configured. Beware of data loss"
          + " due to lack of redundant storage directories!");
    }
    if (editsDirs.size() == 1) {
      LOG.warn("Only one namespace edits storage directory ("
          + DFS_NAMENODE_EDITS_DIR_KEY + ") configured. Beware of data loss"
          + " due to lack of redundant storage directories!");
    }
  }

  /**
   * Instantiates an FSNamesystem loaded from the image and edits
   * directories specified in the passed Configuration.
   *
   * @param conf the Configuration which specifies the storage directories
   *             from which to load
   * @return an FSNamesystem which contains the loaded namespace
   * @throws IOException if loading fails
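   * <p>
   * A minimal usage sketch:
   * <pre>
   *   Configuration conf = new HdfsConfiguration();
   *   FSNamesystem fsn = FSNamesystem.loadFromDisk(conf);
   * </pre>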
   */
  public static FSNamesystem loadFromDisk(Configuration conf)
      throws IOException {

    checkConfiguration(conf);
    FSImage fsImage = new FSImage(conf,
        FSNamesystem.getNamespaceDirs(conf),
        FSNamesystem.getNamespaceEditsDirs(conf));
    FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
    StartupOption startOpt = NameNode.getStartupOption(conf);
    if (startOpt == StartupOption.RECOVER) {
      namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
    }

    long loadStart = now();
    String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
    try {
      namesystem.loadFSImage(startOpt, fsImage,
        HAUtil.isHAEnabled(conf, nameserviceId));
    } catch (IOException ioe) {
      LOG.warn("Encountered exception loading fsimage", ioe);
      fsImage.close();
      throw ioe;
    }
    long timeTakenToLoadFSImage = now() - loadStart;
    LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
    NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
    if (nnMetrics != null) {
      nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
    }
    return namesystem;
  }

  FSNamesystem(Configuration conf, FSImage fsImage) throws IOException {
    this(conf, fsImage, false);
  }

  /**
   * Create an FSNamesystem associated with the specified image.
   *
   * Note that this does not load any data off of disk -- if you would
   * like that behavior, use {@link #loadFromDisk(Configuration)}.
   *
   * @param conf configuration
   * @param fsImage The FSImage to associate with
   * @param ignoreRetryCache Whether to skip the retry cache setup step. For
   *                         the Secondary NN this should be set to true.
   * @throws IOException on bad configuration
   */
  FSNamesystem(Configuration conf, FSImage fsImage, boolean ignoreRetryCache)
      throws IOException {
    if (conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY,
                        DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT)) {
      LOG.info("Enabling async auditlog");
      enableAsyncAuditLog();
    }
    boolean fair = conf.getBoolean("dfs.namenode.fslock.fair", true);
    LOG.info("fsLock is fair: " + fair);
    fsLock = new FSNamesystemLock(fair);
    try {
      resourceRecheckInterval = conf.getLong(
          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY,
          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT);

      this.blockManager = new BlockManager(this, this, conf);
      this.datanodeStatistics = blockManager.getDatanodeManager().getDatanodeStatistics();
      this.blockIdGenerator = new SequentialBlockIdGenerator(this.blockManager);

      this.fsOwner = UserGroupInformation.getCurrentUser();
      this.fsOwnerShortUserName = fsOwner.getShortUserName();
      this.supergroup = conf.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY,
                                 DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT);
      this.isPermissionEnabled = conf.getBoolean(DFS_PERMISSIONS_ENABLED_KEY,
                                                 DFS_PERMISSIONS_ENABLED_DEFAULT);
      LOG.info("fsOwner             = " + fsOwner);
      LOG.info("supergroup          = " + supergroup);
      LOG.info("isPermissionEnabled = " + isPermissionEnabled);

      // block allocation has to be persisted in HA using a shared edits directory
      // so that the standby has up-to-date namespace information
      String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
      this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId);

      // Sanity check the HA-related config.
      if (nameserviceId != null) {
        LOG.info("Determined nameservice ID: " + nameserviceId);
      }
      LOG.info("HA Enabled: " + haEnabled);
      if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) {
        LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf));
        throw new IOException("Invalid configuration: a shared edits dir " +
            "must not be specified if HA is not enabled.");
      }

      // Get the checksum type from config
      String checksumTypeStr = conf.get(DFS_CHECKSUM_TYPE_KEY, DFS_CHECKSUM_TYPE_DEFAULT);
      DataChecksum.Type checksumType;
      try {
         checksumType = DataChecksum.Type.valueOf(checksumTypeStr);
      } catch (IllegalArgumentException iae) {
         throw new IOException("Invalid checksum type in "
            + DFS_CHECKSUM_TYPE_KEY + ": " + checksumTypeStr);
      }

      this.serverDefaults = new FsServerDefaults(
          conf.getLongBytes(DFS_BLOCK_SIZE_KEY, DFS_BLOCK_SIZE_DEFAULT),
          conf.getInt(DFS_BYTES_PER_CHECKSUM_KEY, DFS_BYTES_PER_CHECKSUM_DEFAULT),
          conf.getInt(DFS_CLIENT_WRITE_PACKET_SIZE_KEY, DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT),
          (short) conf.getInt(DFS_REPLICATION_KEY, DFS_REPLICATION_DEFAULT),
          conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT),
          conf.getBoolean(DFS_ENCRYPT_DATA_TRANSFER_KEY, DFS_ENCRYPT_DATA_TRANSFER_DEFAULT),
          conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT),
          checksumType);

      this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY,
                                       DFS_NAMENODE_MAX_OBJECTS_DEFAULT);

      this.minBlockSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY,
          DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_DEFAULT);
      this.maxBlocksPerFile = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY,
          DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_DEFAULT);
      this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY,
          DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT);
      this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT);
      LOG.info("Append Enabled: " + supportAppends);

      this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf);

      this.standbyShouldCheckpoint = conf.getBoolean(
          DFS_HA_STANDBY_CHECKPOINTS_KEY, DFS_HA_STANDBY_CHECKPOINTS_DEFAULT);
      // The edit log auto-roll threshold is a multiple of the checkpoint
      // transaction count.
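      // E.g. assuming the stock defaults (multiplier 2.0 and 1,000,000
      // checkpoint transactions), the active NN would roll its edit log
      // roughly every 2,000,000 edits.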
      this.editLogRollerThreshold = (long)
          (conf.getFloat(
              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD,
              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT) *
          conf.getLong(
              DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
              DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT));
      this.editLogRollerInterval = conf.getInt(
          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS,
          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT);
      this.inodeId = new INodeId();

      // For testing purposes, allow the DT secret manager to be started regardless
      // of whether security is enabled.
      alwaysUseDelegationTokensForTests = conf.getBoolean(
          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);

      this.dtSecretManager = createDelegationTokenSecretManager(conf);
      this.dir = new FSDirectory(fsImage, this, conf);
      this.snapshotManager = new SnapshotManager(dir);
      this.cacheManager = new CacheManager(this, conf, blockManager);
      this.safeMode = new SafeModeInfo(conf);
      this.auditLoggers = initAuditLoggers(conf);
      this.isDefaultAuditLogger = auditLoggers.size() == 1 &&
        auditLoggers.get(0) instanceof DefaultAuditLogger;
      this.retryCache = ignoreRetryCache ? null : initRetryCache(conf);
    } catch(IOException e) {
      LOG.error(getClass().getSimpleName() + " initialization failed.", e);
      close();
      throw e;
    } catch (RuntimeException re) {
      LOG.error(getClass().getSimpleName() + " initialization failed.", re);
      close();
      throw re;
    }
  }

  @VisibleForTesting
  public RetryCache getRetryCache() {
    return retryCache;
  }

  /** Whether or not retry cache is enabled */
  boolean hasRetryCache() {
    return retryCache != null;
  }

  void addCacheEntryWithPayload(byte[] clientId, int callId, Object payload) {
    if (retryCache != null) {
      retryCache.addCacheEntryWithPayload(clientId, callId, payload);
    }
  }

  void addCacheEntry(byte[] clientId, int callId) {
    if (retryCache != null) {
      retryCache.addCacheEntry(clientId, callId);
    }
  }

  @VisibleForTesting
  static RetryCache initRetryCache(Configuration conf) {
    boolean enable = conf.getBoolean(DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY,
        DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT);
    LOG.info("Retry cache on namenode is " + (enable ? "enabled" : "disabled"));
    if (enable) {
      float heapPercent = conf.getFloat(
          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY,
          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT);
      long entryExpiryMillis = conf.getLong(
          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY,
          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT);
      LOG.info("Retry cache will use " + heapPercent
          + " of total heap and retry cache entry expiry time is "
          + entryExpiryMillis + " millis");
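      // Convert the configured expiry from milliseconds to the nanoseconds
      // that the RetryCache constructor takes.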
      long entryExpiryNanos = entryExpiryMillis * 1000 * 1000;
      return new RetryCache("Namenode Retry Cache", heapPercent,
          entryExpiryNanos);
    }
    return null;
  }

  private List<AuditLogger> initAuditLoggers(Configuration conf) {
    // Initialize the custom access loggers if configured.
    Collection<String> alClasses = conf.getStringCollection(DFS_NAMENODE_AUDIT_LOGGERS_KEY);
    List<AuditLogger> auditLoggers = Lists.newArrayList();
    if (alClasses != null && !alClasses.isEmpty()) {
      for (String className : alClasses) {
        try {
          AuditLogger logger;
          if (DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME.equals(className)) {
            logger = new DefaultAuditLogger();
          } else {
            logger = (AuditLogger) Class.forName(className).newInstance();
          }
          logger.initialize(conf);
          auditLoggers.add(logger);
        } catch (RuntimeException re) {
          throw re;
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }
    }

    // Make sure there is at least one logger installed.
    if (auditLoggers.isEmpty()) {
      auditLoggers.add(new DefaultAuditLogger());
    }
    return Collections.unmodifiableList(auditLoggers);
  }

  void loadFSImage(StartupOption startOpt, FSImage fsImage, boolean haEnabled)
      throws IOException {
    // format before starting up if requested
    if (startOpt == StartupOption.FORMAT) {

      fsImage.format(this, fsImage.getStorage().determineClusterId()); // reuse current id

      startOpt = StartupOption.REGULAR;
    }
    boolean success = false;
    writeLock();
    try {
      // We shouldn't be calling saveNamespace if we've come up in standby state.
      MetaRecoveryContext recovery = startOpt.createRecoveryContext();
      boolean needToSave =
        fsImage.recoverTransitionRead(startOpt, this, recovery) && !haEnabled;
      if (needToSave) {
        fsImage.saveNamespace(this);
      } else {
        // No need to save, so mark the phase done.
        StartupProgress prog = NameNode.getStartupProgress();
        prog.beginPhase(Phase.SAVING_CHECKPOINT);
        prog.endPhase(Phase.SAVING_CHECKPOINT);
      }
      // This will start a new log segment and write to the seen_txid file, so
      // we shouldn't do it when coming up in standby state
      if (!haEnabled) {
        fsImage.openEditLogForWrite();
      }
      success = true;
    } finally {
      if (!success) {
        fsImage.close();
      }
      writeUnlock();
    }
    dir.imageLoadComplete();
  }

  private void startSecretManager() {
    if (dtSecretManager != null) {
      try {
        dtSecretManager.startThreads();
      } catch (IOException e) {
        // Inability to start secret manager
        // can't be recovered from.
        throw new RuntimeException(e);
      }
    }
  }

  private void startSecretManagerIfNecessary() {
    boolean shouldRun = shouldUseDelegationTokens() &&
      !isInSafeMode() && getEditLog().isOpenForWrite();
    boolean running = dtSecretManager.isRunning();
    if (shouldRun && !running) {
      startSecretManager();
    }
  }

  private void stopSecretManager() {
    if (dtSecretManager != null) {
      dtSecretManager.stopThreads();
    }
  }

  /**
   * Start services common to both active and standby states.
   * @param conf the configuration
   * @param haContext the HA context
   * @throws IOException
   */
  void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
    this.registerMBean(); // register the MBean for the FSNamesystemState
    writeLock();
    this.haContext = haContext;
    try {
      nnResourceChecker = new NameNodeResourceChecker(conf);
      checkAvailableResources();
      assert safeMode != null &&
        !safeMode.isPopulatingReplQueues();
      StartupProgress prog = NameNode.getStartupProgress();
      prog.beginPhase(Phase.SAFEMODE);
      prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
        getCompleteBlocksTotal());
      setBlockTotal();
      blockManager.activate(conf);
    } finally {
      writeUnlock();
    }

    registerMXBean();
    DefaultMetricsSystem.instance().register(this);
  }

  /**
   * Stop services common to both active and standby states.
   */
  void stopCommonServices() {
    writeLock();
    try {
      if (blockManager != null) blockManager.close();
    } finally {
      writeUnlock();
    }
    RetryCache.clear(retryCache);
  }

  /**
   * Start services required in active state
   * @throws IOException
   */
  void startActiveServices() throws IOException {
    startingActiveService = true;
    LOG.info("Starting services required for active state");
    writeLock();
    try {
      FSEditLog editLog = dir.fsImage.getEditLog();

      if (!editLog.isOpenForWrite()) {
        // During startup, we're already open for write during initialization.
        editLog.initJournalsForWrite();
        // May need to recover
        editLog.recoverUnclosedStreams();

        LOG.info("Catching up to latest edits from old active before " +
            "taking over writer role in edits logs");
        editLogTailer.catchupDuringFailover();

        blockManager.setPostponeBlocksFromFuture(false);
        blockManager.getDatanodeManager().markAllDatanodesStale();
        blockManager.clearQueues();
        blockManager.processAllPendingDNMessages();

        if (!isInSafeMode() ||
            (isInSafeMode() && safeMode.isPopulatingReplQueues())) {
          LOG.info("Reprocessing replication and invalidation queues");
          blockManager.processMisReplicatedBlocks();
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug("NameNode metadata after re-processing " +
              "replication and invalidation queues during failover:\n" +
              metaSaveAsString());
        }

        long nextTxId = dir.fsImage.getLastAppliedTxId() + 1;
        LOG.info("Will take over writing edit logs at txnid " +
            nextTxId);
        editLog.setNextTxId(nextTxId);

        dir.fsImage.editLog.openForWrite();
      }
      if (haEnabled) {
        // Renew all of the leases before becoming active.
        // This is because, while we were in standby mode,
        // the leases weren't getting renewed on this NN.
        // Give them all a fresh start here.
        leaseManager.renewAllLeases();
      }
      leaseManager.startMonitor();
      startSecretManagerIfNecessary();

      //ResourceMonitor required only at ActiveNN. See HDFS-2914
      this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
      nnrmthread.start();

      nnEditLogRoller = new Daemon(new NameNodeEditLogRoller(
          editLogRollerThreshold, editLogRollerInterval));
      nnEditLogRoller.start();

      cacheManager.startMonitorThread();
      blockManager.getDatanodeManager().setShouldSendCachingCommands(true);
    } finally {
      writeUnlock();
      startingActiveService = false;
    }
  }

  /**
   * @return Whether the namenode is transitioning to active state and is in
   *         the middle of {@link #startActiveServices()}.
   */
  public boolean inTransitionToActive() {
    return haEnabled && haContext != null
        && haContext.getState().getServiceState() == HAServiceState.ACTIVE
        && startingActiveService;
  }

  private boolean shouldUseDelegationTokens() {
    return UserGroupInformation.isSecurityEnabled() ||
      alwaysUseDelegationTokensForTests;
  }

  /**
   * Stop services required in active state.
   */
  void stopActiveServices() {
    LOG.info("Stopping services started for active state");
    writeLock();
    try {
      stopSecretManager();
      if (leaseManager != null) {
        leaseManager.stopMonitor();
      }
      if (nnrmthread != null) {
        ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor();
        nnrmthread.interrupt();
      }
      if (nnEditLogRoller != null) {
        ((NameNodeEditLogRoller)nnEditLogRoller.getRunnable()).stop();
        nnEditLogRoller.interrupt();
      }
      if (dir != null && dir.fsImage != null) {
        if (dir.fsImage.editLog != null) {
          dir.fsImage.editLog.close();
        }
        // Update the fsimage with the last txid that we wrote
        // so that the tailer starts from the right spot.
        dir.fsImage.updateLastAppliedTxIdFromWritten();
      }
      cacheManager.stopMonitorThread();
      cacheManager.clearDirectiveStats();
      blockManager.getDatanodeManager().clearPendingCachingCommands();
      blockManager.getDatanodeManager().setShouldSendCachingCommands(false);
    } finally {
      writeUnlock();
    }
  }

  /**
   * Start services required in standby state
   *
   * @throws IOException
   */
  void startStandbyServices(final Configuration conf) throws IOException {
    LOG.info("Starting services required for standby state");
    if (!dir.fsImage.editLog.isOpenForRead()) {
      // During startup, we're already open for read.
      dir.fsImage.editLog.initSharedJournalsForRead();
    }

    blockManager.setPostponeBlocksFromFuture(true);

    editLogTailer = new EditLogTailer(this, conf);
    editLogTailer.start();
    if (standbyShouldCheckpoint) {
      standbyCheckpointer = new StandbyCheckpointer(conf, this);
      standbyCheckpointer.start();
    }
  }


  /**
   * Called while the NN is in Standby state, but just about to be
   * asked to enter Active state. This cancels any checkpoints
   * currently being taken.
   */
  void prepareToStopStandbyServices() throws ServiceFailedException {
    if (standbyCheckpointer != null) {
      standbyCheckpointer.cancelAndPreventCheckpoints(
          "About to leave standby state");
    }
  }

  /** Stop services required in standby state */
  void stopStandbyServices() throws IOException {
    LOG.info("Stopping services started for standby state");
    if (standbyCheckpointer != null) {
      standbyCheckpointer.stop();
    }
    if (editLogTailer != null) {
      editLogTailer.stop();
    }
    if (dir != null && dir.fsImage != null && dir.fsImage.editLog != null) {
      dir.fsImage.editLog.close();
    }
  }

  @Override
  public void checkOperation(OperationCategory op) throws StandbyException {
    if (haContext != null) {
      // null in some unit tests
      haContext.checkOperation(op);
    }
  }

  /**
   * @throws RetriableException
   *           If 1) The NameNode is in SafeMode, 2) HA is enabled, 3) the
   *           NameNode is in active state, and 4) the safe mode is neither
   *           manual nor caused by low resources.
   * @throws SafeModeException
   *           Otherwise if NameNode is in SafeMode.
   */
  private void checkNameNodeSafeMode(String errorMsg)
      throws RetriableException, SafeModeException {
    if (isInSafeMode()) {
      SafeModeException se = new SafeModeException(errorMsg, safeMode);
      if (haEnabled && haContext != null
          && haContext.getState().getServiceState() == HAServiceState.ACTIVE
          && shouldRetrySafeMode(this.safeMode)) {
        throw new RetriableException(se);
      } else {
        throw se;
      }
    }
  }

  /**
   * Safe mode is already known to be on. A RetriableException should be
   * thrown only when the safe mode is neither manual nor caused by low
   * resources.
   */
  private boolean shouldRetrySafeMode(SafeModeInfo safeMode) {
    if (safeMode == null) {
      return false;
    } else {
      return !safeMode.isManual() && !safeMode.areResourcesLow();
    }
  }

  public static Collection<URI> getNamespaceDirs(Configuration conf) {
    return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY);
  }

  /**
   * Get all edits dirs which are required. If any shared edits dirs are
   * configured, these are also included in the set of required dirs.
   *
   * @param conf the HDFS configuration.
   * @return all required dirs.
   */
  public static Collection<URI> getRequiredNamespaceEditsDirs(Configuration conf) {
    Set<URI> ret = new HashSet<URI>();
    ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY));
    ret.addAll(getSharedEditsDirs(conf));
    return ret;
  }

  private static Collection<URI> getStorageDirs(Configuration conf,
                                                String propertyName) {
    Collection<String> dirNames = conf.getTrimmedStringCollection(propertyName);
    StartupOption startOpt = NameNode.getStartupOption(conf);
    if(startOpt == StartupOption.IMPORT) {
      // In case of IMPORT this will get rid of default directories
      // but will retain directories specified in hdfs-site.xml
      // When importing image from a checkpoint, the name-node can
      // start with empty set of storage directories.
      Configuration cE = new HdfsConfiguration(false);
      cE.addResource("core-default.xml");
      cE.addResource("core-site.xml");
      cE.addResource("hdfs-default.xml");
      Collection<String> dirNames2 = cE.getTrimmedStringCollection(propertyName);
      dirNames.removeAll(dirNames2);
      if(dirNames.isEmpty())
        LOG.warn("!!! WARNING !!!" +
          "\n\tThe NameNode currently runs without persistent storage." +
          "\n\tAny changes to the file system meta-data may be lost." +
          "\n\tRecommended actions:" +
          "\n\t\t- shutdown and restart NameNode with configured \""
          + propertyName + "\" in hdfs-site.xml;" +
          "\n\t\t- use Backup Node as a persistent and up-to-date storage " +
          "of the file system meta-data.");
    } else if (dirNames.isEmpty()) {
      dirNames = Collections.singletonList(
          DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT);
    }
    return Util.stringCollectionAsURIs(dirNames);
  }

  /**
   * Return an ordered list of edits directories to write to.
   * The list is ordered such that all shared edits directories
   * are ordered before non-shared directories, and any duplicates
   * are removed. The order they are specified in the configuration
   * is retained.
   * @return an ordered collection of edits directories (shared first,
   *         then local), with duplicates removed.
   * @throws IOException if multiple shared edits directories are configured
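   * <p>
   * Illustrative sketch (hypothetical URIs): with
   * {@code dfs.namenode.shared.edits.dir = qjournal://ns/edits} and
   * {@code dfs.namenode.edits.dir = file:///data/edits}, the returned list
   * is [qjournal://ns/edits, file:///data/edits].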
   */
  public static List<URI> getNamespaceEditsDirs(Configuration conf)
      throws IOException {
    return getNamespaceEditsDirs(conf, true);
  }

  public static List<URI> getNamespaceEditsDirs(Configuration conf,
      boolean includeShared)
      throws IOException {
    // Use a LinkedHashSet so that order is maintained while we de-dup
    // the entries.
    LinkedHashSet<URI> editsDirs = new LinkedHashSet<URI>();

    if (includeShared) {
      List<URI> sharedDirs = getSharedEditsDirs(conf);

      // Fail until multiple shared edits directories are supported (HDFS-2782)
      if (sharedDirs.size() > 1) {
        throw new IOException(
            "Multiple shared edits directories are not yet supported");
      }

      // First add the shared edits dirs. It's critical that the shared dirs
      // are added first, since JournalSet syncs them in the order they are listed,
      // and we need to make sure all edits are in place in the shared storage
      // before they are replicated locally. See HDFS-2874.
      for (URI dir : sharedDirs) {
        if (!editsDirs.add(dir)) {
          LOG.warn("Edits URI " + dir + " listed multiple times in " +
              DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates.");
        }
      }
    }
    // Now add the non-shared dirs.
    for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) {
      if (!editsDirs.add(dir)) {
        LOG.warn("Edits URI " + dir + " listed multiple times in " +
            DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " +
            DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates.");
      }
    }

    if (editsDirs.isEmpty()) {
      // If this is the case, no edit dirs have been explicitly configured.
      // Image dirs are to be used for edits too.
      return Lists.newArrayList(getNamespaceDirs(conf));
    } else {
      return Lists.newArrayList(editsDirs);
    }
  }

  /**
   * Returns edit directories that are shared between the active and standby
   * NameNodes.
   * @param conf the configuration
   * @return Collection of edit directories.
   */
1269  public static List<URI> getSharedEditsDirs(Configuration conf) {
1270    // don't use getStorageDirs here, because we want an empty default
1271    // rather than the dir in /tmp
1272    Collection<String> dirNames = conf.getTrimmedStringCollection(
1273        DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
1274    return Util.stringCollectionAsURIs(dirNames);
1275  }
1276
1277  @Override
1278  public void readLock() {
1279    this.fsLock.readLock().lock();
1280  }
1281  @Override
1282  public void readUnlock() {
1283    this.fsLock.readLock().unlock();
1284  }
1285  @Override
1286  public void writeLock() {
1287    this.fsLock.writeLock().lock();
1288  }
1289  @Override
1290  public void writeLockInterruptibly() throws InterruptedException {
1291    this.fsLock.writeLock().lockInterruptibly();
1292  }
1293  @Override
1294  public void writeUnlock() {
1295    this.fsLock.writeLock().unlock();
1296  }
1297  @Override
1298  public boolean hasWriteLock() {
1299    return this.fsLock.isWriteLockedByCurrentThread();
1300  }
1301  @Override
1302  public boolean hasReadLock() {
1303    return this.fsLock.getReadHoldCount() > 0 || hasWriteLock();
1304  }
1305
1306  public int getReadHoldCount() {
1307    return this.fsLock.getReadHoldCount();
1308  }
1309
1310  public int getWriteHoldCount() {
1311    return this.fsLock.getWriteHoldCount();
1312  }
1313
1314  NamespaceInfo getNamespaceInfo() {
1315    readLock();
1316    try {
1317      return unprotectedGetNamespaceInfo();
1318    } finally {
1319      readUnlock();
1320    }
1321  }
1322
1323  /**
   * Version of {@link #getNamespaceInfo()} that is not protected by a lock.
1325   */
1326  NamespaceInfo unprotectedGetNamespaceInfo() {
1327    return new NamespaceInfo(dir.fsImage.getStorage().getNamespaceID(),
1328        getClusterId(), getBlockPoolId(),
1329        dir.fsImage.getStorage().getCTime());
1330  }
1331
1332  /**
1333   * Close down this file system manager.
   * Causes the heartbeat and lease daemons to stop; waits briefly for
   * them to finish, but a short timeout returns control to the caller.
1336   */
1337  void close() {
1338    fsRunning = false;
1339    try {
1340      stopCommonServices();
1341      if (smmthread != null) smmthread.interrupt();
1342    } finally {
1343      // using finally to ensure we also wait for lease daemon
1344      try {
1345        stopActiveServices();
1346        stopStandbyServices();
1347        if (dir != null) {
1348          dir.close();
1349        }
1350      } catch (IOException ie) {
1351        LOG.error("Error closing FSDirectory", ie);
1352        IOUtils.cleanup(LOG, dir);
1353      }
1354    }
1355  }
1356
1357  @Override
1358  public boolean isRunning() {
1359    return fsRunning;
1360  }
1361  
1362  @Override
1363  public boolean isInStandbyState() {
1364    if (haContext == null || haContext.getState() == null) {
1365      // We're still starting up. In this case, if HA is
1366      // on for the cluster, we always start in standby. Otherwise
1367      // start in active.
1368      return haEnabled;
1369    }
1370
1371    return HAServiceState.STANDBY == haContext.getState().getServiceState();
1372  }
1373
1374  /**
   * Dump all metadata into the specified file.
1376   */
1377  void metaSave(String filename) throws IOException {
1378    checkSuperuserPrivilege();
1379    checkOperation(OperationCategory.UNCHECKED);
1380    writeLock();
1381    try {
1382      checkOperation(OperationCategory.UNCHECKED);
      File file = new File(System.getProperty("hadoop.log.dir"), filename);
      PrintWriter out = new PrintWriter(new BufferedWriter(
          new OutputStreamWriter(new FileOutputStream(file), Charsets.UTF_8)));
      try {
        metaSave(out);
        out.flush();
      } finally {
        // ensure the stream is closed even if metaSave() throws
        out.close();
      }
1389    } finally {
1390      writeUnlock();
1391    }
1392  }
1393
1394  private void metaSave(PrintWriter out) {
1395    assert hasWriteLock();
1396    long totalInodes = this.dir.totalInodes();
1397    long totalBlocks = this.getBlocksTotal();
1398    out.println(totalInodes + " files and directories, " + totalBlocks
1399        + " blocks = " + (totalInodes + totalBlocks) + " total");
1400
1401    blockManager.metaSave(out);
1402  }
1403
1404  private String metaSaveAsString() {
1405    StringWriter sw = new StringWriter();
1406    PrintWriter pw = new PrintWriter(sw);
1407    metaSave(pw);
1408    pw.flush();
1409    return sw.toString();
1410  }
1411  
1412
1413  long getDefaultBlockSize() {
1414    return serverDefaults.getBlockSize();
1415  }
1416
1417  FsServerDefaults getServerDefaults() throws StandbyException {
1418    checkOperation(OperationCategory.READ);
1419    return serverDefaults;
1420  }
1421
1422  long getAccessTimePrecision() {
1423    return accessTimePrecision;
1424  }
1425
1426  private boolean isAccessTimeSupported() {
1427    return accessTimePrecision > 0;
1428  }
1429
1430  /////////////////////////////////////////////////////////
1431  //
1432  // These methods are called by HadoopFS clients
1433  //
1434  /////////////////////////////////////////////////////////
1435  /**
1436   * Set permissions for an existing file.
1437   * @throws IOException
1438   */
1439  void setPermission(String src, FsPermission permission)
1440      throws AccessControlException, FileNotFoundException, SafeModeException,
1441      UnresolvedLinkException, IOException {
1442    try {
1443      setPermissionInt(src, permission);
1444    } catch (AccessControlException e) {
1445      logAuditEvent(false, "setPermission", src);
1446      throw e;
1447    }
1448  }
1449
1450  private void setPermissionInt(String src, FsPermission permission)
1451      throws AccessControlException, FileNotFoundException, SafeModeException,
1452      UnresolvedLinkException, IOException {
1453    HdfsFileStatus resultingStat = null;
1454    FSPermissionChecker pc = getPermissionChecker();
1455    checkOperation(OperationCategory.WRITE);
1456    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
1457    writeLock();
1458    try {
1459      checkOperation(OperationCategory.WRITE);
1460      checkNameNodeSafeMode("Cannot set permission for " + src);
1461      src = FSDirectory.resolvePath(src, pathComponents, dir);
1462      checkOwner(pc, src);
1463      dir.setPermission(src, permission);
1464      resultingStat = getAuditFileInfo(src, false);
1465    } finally {
1466      writeUnlock();
1467    }
1468    getEditLog().logSync();
1469    logAuditEvent(true, "setPermission", src, null, resultingStat);
1470  }
1471
1472  /**
1473   * Set owner for an existing file.
1474   * @throws IOException
1475   */
1476  void setOwner(String src, String username, String group)
1477      throws AccessControlException, FileNotFoundException, SafeModeException,
1478      UnresolvedLinkException, IOException {
1479    try {
1480      setOwnerInt(src, username, group);
1481    } catch (AccessControlException e) {
1482      logAuditEvent(false, "setOwner", src);
1483      throw e;
1484    } 
1485  }
1486
1487  private void setOwnerInt(String src, String username, String group)
1488      throws AccessControlException, FileNotFoundException, SafeModeException,
1489      UnresolvedLinkException, IOException {
1490    HdfsFileStatus resultingStat = null;
1491    FSPermissionChecker pc = getPermissionChecker();
1492    checkOperation(OperationCategory.WRITE);
1493    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
1494    writeLock();
1495    try {
1496      checkOperation(OperationCategory.WRITE);
1497      checkNameNodeSafeMode("Cannot set owner for " + src);
1498      src = FSDirectory.resolvePath(src, pathComponents, dir);
1499      checkOwner(pc, src);
1500      if (!pc.isSuperUser()) {
1501        if (username != null && !pc.getUser().equals(username)) {
1502          throw new AccessControlException("Non-super user cannot change owner");
1503        }
1504        if (group != null && !pc.containsGroup(group)) {
1505          throw new AccessControlException("User does not belong to " + group);
1506        }
1507      }
1508      dir.setOwner(src, username, group);
1509      resultingStat = getAuditFileInfo(src, false);
1510    } finally {
1511      writeUnlock();
1512    }
1513    getEditLog().logSync();
1514    logAuditEvent(true, "setOwner", src, null, resultingStat);
1515  }
1516
1517  /**
1518   * Get block locations within the specified range.
1519   * @see ClientProtocol#getBlockLocations(String, long, long)
1520   */
1521  LocatedBlocks getBlockLocations(String clientMachine, String src,
1522      long offset, long length) throws AccessControlException,
1523      FileNotFoundException, UnresolvedLinkException, IOException {
1524    LocatedBlocks blocks = getBlockLocations(src, offset, length, true, true,
1525        true);
1526    if (blocks != null) {
1527      blockManager.getDatanodeManager().sortLocatedBlocks(
1528          clientMachine, blocks.getLocatedBlocks());
1529      
1530      LocatedBlock lastBlock = blocks.getLastLocatedBlock();
1531      if (lastBlock != null) {
1532        ArrayList<LocatedBlock> lastBlockList = new ArrayList<LocatedBlock>();
1533        lastBlockList.add(lastBlock);
1534        blockManager.getDatanodeManager().sortLocatedBlocks(
1535                              clientMachine, lastBlockList);
1536      }
1537    }
1538    return blocks;
1539  }
1540
1541  /**
1542   * Get block locations within the specified range.
1543   * @see ClientProtocol#getBlockLocations(String, long, long)
   * @throws FileNotFoundException if the file does not exist
   * @throws UnresolvedLinkException if a symlink in the path cannot be resolved
   * @throws IOException on other errors
1545   */
1546  LocatedBlocks getBlockLocations(String src, long offset, long length,
1547      boolean doAccessTime, boolean needBlockToken, boolean checkSafeMode)
1548      throws FileNotFoundException, UnresolvedLinkException, IOException {
1549    try {
1550      return getBlockLocationsInt(src, offset, length, doAccessTime,
1551                                  needBlockToken, checkSafeMode);
1552    } catch (AccessControlException e) {
1553      logAuditEvent(false, "open", src);
1554      throw e;
1555    }
1556  }
1557
1558  private LocatedBlocks getBlockLocationsInt(String src, long offset,
1559      long length, boolean doAccessTime, boolean needBlockToken,
1560      boolean checkSafeMode)
1561      throws FileNotFoundException, UnresolvedLinkException, IOException {
1562    if (offset < 0) {
1563      throw new HadoopIllegalArgumentException(
1564          "Negative offset is not supported. File: " + src);
1565    }
1566    if (length < 0) {
1567      throw new HadoopIllegalArgumentException(
1568          "Negative length is not supported. File: " + src);
1569    }
1570    final LocatedBlocks ret = getBlockLocationsUpdateTimes(src,
1571        offset, length, doAccessTime, needBlockToken);  
1572    logAuditEvent(true, "open", src);
1573    if (checkSafeMode && isInSafeMode()) {
1574      for (LocatedBlock b : ret.getLocatedBlocks()) {
1575        // if safemode & no block locations yet then throw safemodeException
1576        if ((b.getLocations() == null) || (b.getLocations().length == 0)) {
1577          SafeModeException se = new SafeModeException(
1578              "Zero blocklocations for " + src, safeMode);
1579          if (haEnabled && haContext != null && 
1580              haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
1581            throw new RetriableException(se);
1582          } else {
1583            throw se;
1584          }
1585        }
1586      }
1587    }
1588    return ret;
1589  }
1590
1591  /*
1592   * Get block locations within the specified range, updating the
1593   * access times if necessary. 
1594   */
1595  private LocatedBlocks getBlockLocationsUpdateTimes(String src, long offset,
1596      long length, boolean doAccessTime, boolean needBlockToken)
1597      throws FileNotFoundException,
1598      UnresolvedLinkException, IOException {
1599    FSPermissionChecker pc = getPermissionChecker();
1600    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
1601    for (int attempt = 0; attempt < 2; attempt++) {
1602      boolean isReadOp = (attempt == 0);
1603      if (isReadOp) { // first attempt is with readlock
1604        checkOperation(OperationCategory.READ);
1605        readLock();
      } else { // second attempt is with write lock
1607        checkOperation(OperationCategory.WRITE);
1608        writeLock(); // writelock is needed to set accesstime
1609      }
      try {
        // resolve the path inside the try block so the lock is released
        // even if resolution throws
        src = FSDirectory.resolvePath(src, pathComponents, dir);
1612        if (isReadOp) {
1613          checkOperation(OperationCategory.READ);
1614        } else {
1615          checkOperation(OperationCategory.WRITE);
1616        }
1617        if (isPermissionEnabled) {
1618          checkPathAccess(pc, src, FsAction.READ);
1619        }
1620
1621        // if the namenode is in safemode, then do not update access time
1622        if (isInSafeMode()) {
1623          doAccessTime = false;
1624        }
1625
1626        final INodesInPath iip = dir.getLastINodeInPath(src);
1627        final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
1628        if (!iip.isSnapshot() //snapshots are readonly, so don't update atime.
1629            && doAccessTime && isAccessTimeSupported()) {
1630          final long now = now();
1631          if (now > inode.getAccessTime() + getAccessTimePrecision()) {
1632            // if we have to set access time but we only have the readlock, then
1633            // restart this entire operation with the writeLock.
1634            if (isReadOp) {
1635              continue;
1636            }
1637            dir.setTimes(src, inode, -1, now, false, iip.getLatestSnapshot());
1638          }
1639        }
1640        final long fileSize = iip.isSnapshot() ?
1641            inode.computeFileSize(iip.getPathSnapshot())
1642            : inode.computeFileSizeNotIncludingLastUcBlock();
1643        boolean isUc = inode.isUnderConstruction();
1644        if (iip.isSnapshot()) {
1645          // if src indicates a snapshot file, we need to make sure the returned
1646          // blocks do not exceed the size of the snapshot file.
1647          length = Math.min(length, fileSize - offset);
1648          isUc = false;
1649        }
1650        LocatedBlocks blocks =
1651          blockManager.createLocatedBlocks(inode.getBlocks(), fileSize,
1652            isUc, offset, length, needBlockToken, iip.isSnapshot());
1653        // Set caching information for the located blocks.
1654        for (LocatedBlock lb: blocks.getLocatedBlocks()) {
1655          cacheManager.setCachedLocations(lb);
1656        }
1657        return blocks;
1658      } finally {
1659        if (isReadOp) {
1660          readUnlock();
1661        } else {
1662          writeUnlock();
1663        }
1664      }
1665    }
1666    return null; // can never reach here
1667  }
1668
1669  /**
   * Moves all the blocks from srcs and appends them to the target.
   * To avoid rollbacks we verify the validity of ALL the arguments
   * before we start the actual move.
   * 
   * This does not support ".inodes" relative paths.
   * @param target the file that the source blocks are appended to
   * @param srcs the source files whose blocks are moved to the target
   * @throws IOException
1678   */
1679  void concat(String target, String [] srcs) 
1680      throws IOException, UnresolvedLinkException {
1681    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
1682    if (cacheEntry != null && cacheEntry.isSuccess()) {
1683      return; // Return previous response
1684    }
1685    
    // Either there is no previous request in progress or it has failed
1687    if(FSNamesystem.LOG.isDebugEnabled()) {
1688      FSNamesystem.LOG.debug("concat " + Arrays.toString(srcs) +
1689          " to " + target);
1690    }
1691    
1692    boolean success = false;
1693    try {
1694      concatInt(target, srcs, cacheEntry != null);
1695      success = true;
1696    } catch (AccessControlException e) {
1697      logAuditEvent(false, "concat", Arrays.toString(srcs), target, null);
1698      throw e;
1699    } finally {
1700      RetryCache.setState(cacheEntry, success);
1701    }
1702  }
1703
1704  private void concatInt(String target, String [] srcs, 
1705      boolean logRetryCache) throws IOException, UnresolvedLinkException {
1706    // verify args
1707    if(target.isEmpty()) {
1708      throw new IllegalArgumentException("Target file name is empty");
1709    }
1710    if(srcs == null || srcs.length == 0) {
1711      throw new IllegalArgumentException("No sources given");
1712    }
1713    
    // We require all files to be in the same directory
1715    String trgParent = 
1716      target.substring(0, target.lastIndexOf(Path.SEPARATOR_CHAR));
1717    for (String s : srcs) {
1718      String srcParent = s.substring(0, s.lastIndexOf(Path.SEPARATOR_CHAR));
1719      if (!srcParent.equals(trgParent)) {
1720        throw new IllegalArgumentException(
1721           "Sources and target are not in the same directory");
1722      }
1723    }
1724
1725    HdfsFileStatus resultingStat = null;
1726    FSPermissionChecker pc = getPermissionChecker();
1727    checkOperation(OperationCategory.WRITE);
1728    writeLock();
1729    try {
1730      checkOperation(OperationCategory.WRITE);
1731      checkNameNodeSafeMode("Cannot concat " + target);
1732      concatInternal(pc, target, srcs, logRetryCache);
1733      resultingStat = getAuditFileInfo(target, false);
1734    } finally {
1735      writeUnlock();
1736    }
1737    getEditLog().logSync();
1738    logAuditEvent(true, "concat", Arrays.toString(srcs), target, resultingStat);
1739  }
1740
1741  /** See {@link #concat(String, String[])} */
1742  private void concatInternal(FSPermissionChecker pc, String target,
1743      String[] srcs, boolean logRetryCache) throws IOException,
1744      UnresolvedLinkException {
1745    assert hasWriteLock();
1746
1747    // write permission for the target
1748    if (isPermissionEnabled) {
1749      checkPathAccess(pc, target, FsAction.WRITE);
1750
1751      // and srcs
1752      for(String aSrc: srcs) {
1753        checkPathAccess(pc, aSrc, FsAction.READ); // read the file
1754        checkParentAccess(pc, aSrc, FsAction.WRITE); // for delete 
1755      }
1756    }
1757
1758    // to make sure no two files are the same
1759    Set<INode> si = new HashSet<INode>();
1760
    // the operation has the following prerequisite:
    // replication and block sizes must be the same for ALL the blocks
1763
1764    // check the target
1765    final INodeFile trgInode = INodeFile.valueOf(dir.getINode4Write(target),
1766        target);
1767    if(trgInode.isUnderConstruction()) {
1768      throw new HadoopIllegalArgumentException("concat: target file "
1769          + target + " is under construction");
1770    }
    // per design, the target shouldn't be empty and all its blocks must be
    // the same size
1772    if(trgInode.numBlocks() == 0) {
1773      throw new HadoopIllegalArgumentException("concat: target file "
1774          + target + " is empty");
1775    }
1776    if (trgInode instanceof INodeFileWithSnapshot) {
1777      throw new HadoopIllegalArgumentException("concat: target file "
1778          + target + " is in a snapshot");
1779    }
1780
1781    long blockSize = trgInode.getPreferredBlockSize();
1782
1783    // check the end block to be full
1784    final BlockInfo last = trgInode.getLastBlock();
1785    if(blockSize != last.getNumBytes()) {
1786      throw new HadoopIllegalArgumentException("The last block in " + target
1787          + " is not full; last block size = " + last.getNumBytes()
1788          + " but file block size = " + blockSize);
1789    }
1790
1791    si.add(trgInode);
1792    final short repl = trgInode.getFileReplication();
1793
1794    // now check the srcs
1795    boolean endSrc = false; // final src file doesn't have to have full end block
1796    for(int i=0; i<srcs.length; i++) {
1797      String src = srcs[i];
1798      if(i==srcs.length-1)
1799        endSrc=true;
1800
1801      final INodeFile srcInode = INodeFile.valueOf(dir.getINode4Write(src), src);
1802      if(src.isEmpty() 
1803          || srcInode.isUnderConstruction()
1804          || srcInode.numBlocks() == 0) {
1805        throw new HadoopIllegalArgumentException("concat: source file " + src
1806            + " is invalid or empty or underConstruction");
1807      }
1808
1809      // check replication and blocks size
1810      if(repl != srcInode.getBlockReplication()) {
1811        throw new HadoopIllegalArgumentException("concat: the soruce file "
1812            + src + " and the target file " + target
1813            + " should have the same replication: source replication is "
1814            + srcInode.getBlockReplication()
1815            + " but target replication is " + repl);
1816      }
1817
1819      // verify that all the blocks are of the same length as target
1820      // should be enough to check the end blocks
1821      final BlockInfo[] srcBlocks = srcInode.getBlocks();
1822      int idx = srcBlocks.length-1;
1823      if(endSrc)
1824        idx = srcBlocks.length-2; // end block of endSrc is OK not to be full
1825      if(idx >= 0 && srcBlocks[idx].getNumBytes() != blockSize) {
1826        throw new HadoopIllegalArgumentException("concat: the soruce file "
1827            + src + " and the target file " + target
1828            + " should have the same blocks sizes: target block size is "
1829            + blockSize + " but the size of source block " + idx + " is "
1830            + srcBlocks[idx].getNumBytes());
1831      }
1832
1833      si.add(srcInode);
1834    }
1835
1836    // make sure no two files are the same
1837    if(si.size() < srcs.length+1) { // trg + srcs
1838      // it means at least two files are the same
1839      throw new HadoopIllegalArgumentException(
1840          "concat: at least two of the source files are the same");
1841    }
1842
1843    if(NameNode.stateChangeLog.isDebugEnabled()) {
1844      NameNode.stateChangeLog.debug("DIR* NameSystem.concat: " + 
1845          Arrays.toString(srcs) + " to " + target);
1846    }
1847
1848    dir.concat(target,srcs, logRetryCache);
1849  }
1850  
1851  /**
   * Stores the modification and access time for this inode. 
   * The access time is precise only up to the configured precision
   * (one hour by default). The transaction, if needed, is
1854   * written to the edits log but is not flushed.
1855   */
1856  void setTimes(String src, long mtime, long atime) 
1857      throws IOException, UnresolvedLinkException {
1858    if (!isAccessTimeSupported() && atime != -1) {
1859      throw new IOException("Access time for hdfs is not configured. " +
1860                            " Please set " + DFS_NAMENODE_ACCESSTIME_PRECISION_KEY + " configuration parameter.");
1861    }
1862    try {
1863      setTimesInt(src, mtime, atime);
1864    } catch (AccessControlException e) {
1865      logAuditEvent(false, "setTimes", src);
1866      throw e;
1867    }
1868  }
1869
1870  private void setTimesInt(String src, long mtime, long atime) 
1871    throws IOException, UnresolvedLinkException {
1872    HdfsFileStatus resultingStat = null;
1873    FSPermissionChecker pc = getPermissionChecker();
1874    checkOperation(OperationCategory.WRITE);
1875    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
1876    writeLock();
1877    try {
1878      checkOperation(OperationCategory.WRITE);
1879      checkNameNodeSafeMode("Cannot set times " + src);
1880      src = FSDirectory.resolvePath(src, pathComponents, dir);
1881
1882      // Write access is required to set access and modification times
1883      if (isPermissionEnabled) {
1884        checkPathAccess(pc, src, FsAction.WRITE);
1885      }
1886      final INodesInPath iip = dir.getINodesInPath4Write(src);
1887      final INode inode = iip.getLastINode();
1888      if (inode != null) {
1889        dir.setTimes(src, inode, mtime, atime, true, iip.getLatestSnapshot());
1890        resultingStat = getAuditFileInfo(src, false);
1891      } else {
1892        throw new FileNotFoundException("File/Directory " + src + " does not exist.");
1893      }
1894    } finally {
1895      writeUnlock();
1896    }
1897    logAuditEvent(true, "setTimes", src, null, resultingStat);
1898  }
1899
1900  /**
1901   * Create a symbolic link.
1902   */
1903  @SuppressWarnings("deprecation")
1904  void createSymlink(String target, String link,
1905      PermissionStatus dirPerms, boolean createParent) 
1906      throws IOException, UnresolvedLinkException {
1907    if (!FileSystem.areSymlinksEnabled()) {
1908      throw new UnsupportedOperationException("Symlinks not supported");
1909    }
1910    if (!DFSUtil.isValidName(link)) {
1911      throw new InvalidPathException("Invalid link name: " + link);
1912    }
1913    if (FSDirectory.isReservedName(target)) {
1914      throw new InvalidPathException("Invalid target name: " + target);
1915    }
1916    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
1917    if (cacheEntry != null && cacheEntry.isSuccess()) {
1918      return; // Return previous response
1919    }
1920    boolean success = false;
1921    try {
1922      createSymlinkInt(target, link, dirPerms, createParent, cacheEntry != null);
1923      success = true;
1924    } catch (AccessControlException e) {
1925      logAuditEvent(false, "createSymlink", link, target, null);
1926      throw e;
1927    } finally {
1928      RetryCache.setState(cacheEntry, success);
1929    }
1930  }
1931
1932  private void createSymlinkInt(String target, String link,
1933      PermissionStatus dirPerms, boolean createParent, boolean logRetryCache) 
1934      throws IOException, UnresolvedLinkException {
1935    if (NameNode.stateChangeLog.isDebugEnabled()) {
1936      NameNode.stateChangeLog.debug("DIR* NameSystem.createSymlink: target="
1937          + target + " link=" + link);
1938    }
1939    HdfsFileStatus resultingStat = null;
1940    FSPermissionChecker pc = getPermissionChecker();
1941    checkOperation(OperationCategory.WRITE);
1942    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(link);
1943    writeLock();
1944    try {
1945      checkOperation(OperationCategory.WRITE);
1946      checkNameNodeSafeMode("Cannot create symlink " + link);
1947      link = FSDirectory.resolvePath(link, pathComponents, dir);
1948      if (!createParent) {
1949        verifyParentDir(link);
1950      }
1951      if (!dir.isValidToCreate(link)) {
1952        throw new IOException("failed to create link " + link 
1953            +" either because the filename is invalid or the file exists");
1954      }
1955      if (isPermissionEnabled) {
1956        checkAncestorAccess(pc, link, FsAction.WRITE);
1957      }
1958      // validate that we have enough inodes.
1959      checkFsObjectLimit();
1960
1961      // add symbolic link to namespace
1962      dir.addSymlink(link, target, dirPerms, createParent, logRetryCache);
1963      resultingStat = getAuditFileInfo(link, false);
1964    } finally {
1965      writeUnlock();
1966    }
1967    getEditLog().logSync();
1968    logAuditEvent(true, "createSymlink", link, target, resultingStat);
1969  }
1970
1971  /**
1972   * Set replication for an existing file.
1973   * 
1974   * The NameNode sets new replication and schedules either replication of 
1975   * under-replicated data blocks or removal of the excessive block copies 
1976   * if the blocks are over-replicated.
1977   * 
1978   * @see ClientProtocol#setReplication(String, short)
1979   * @param src file name
1980   * @param replication new replication
1981   * @return true if successful; 
1982   *         false if file does not exist or is a directory
1983   */
1984  boolean setReplication(final String src, final short replication)
1985      throws IOException {
1986    try {
1987      return setReplicationInt(src, replication);
1988    } catch (AccessControlException e) {
1989      logAuditEvent(false, "setReplication", src);
1990      throw e;
1991    }
1992  }
1993
1994  private boolean setReplicationInt(String src, final short replication)
1995      throws IOException {
1996    blockManager.verifyReplication(src, replication, null);
1997    final boolean isFile;
1998    FSPermissionChecker pc = getPermissionChecker();
1999    checkOperation(OperationCategory.WRITE);
2000    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2001    writeLock();
2002    try {
2003      checkOperation(OperationCategory.WRITE);
2004      checkNameNodeSafeMode("Cannot set replication for " + src);
2005      src = FSDirectory.resolvePath(src, pathComponents, dir);
2006      if (isPermissionEnabled) {
2007        checkPathAccess(pc, src, FsAction.WRITE);
2008      }
2009
2010      final short[] blockRepls = new short[2]; // 0: old, 1: new
2011      final Block[] blocks = dir.setReplication(src, replication, blockRepls);
2012      isFile = blocks != null;
2013      if (isFile) {
2014        blockManager.setReplication(blockRepls[0], blockRepls[1], src, blocks);
2015      }
2016    } finally {
2017      writeUnlock();
2018    }
2019
2020    getEditLog().logSync();
2021    if (isFile) {
2022      logAuditEvent(true, "setReplication", src);
2023    }
2024    return isFile;
2025  }
2026
2027  long getPreferredBlockSize(String filename) 
2028      throws IOException, UnresolvedLinkException {
2029    FSPermissionChecker pc = getPermissionChecker();
2030    checkOperation(OperationCategory.READ);
2031    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(filename);
2032    readLock();
2033    try {
2034      checkOperation(OperationCategory.READ);
2035      filename = FSDirectory.resolvePath(filename, pathComponents, dir);
2036      if (isPermissionEnabled) {
2037        checkTraverse(pc, filename);
2038      }
2039      return dir.getPreferredBlockSize(filename);
2040    } finally {
2041      readUnlock();
2042    }
2043  }
2044
2045  /**
2046   * Verify that parent directory of src exists.
2047   */
2048  private void verifyParentDir(String src) throws FileNotFoundException,
2049      ParentNotDirectoryException, UnresolvedLinkException {
2050    assert hasReadLock();
2051    Path parent = new Path(src).getParent();
2052    if (parent != null) {
2053      final INode parentNode = dir.getINode(parent.toString());
2054      if (parentNode == null) {
2055        throw new FileNotFoundException("Parent directory doesn't exist: "
2056            + parent);
2057      } else if (!parentNode.isDirectory() && !parentNode.isSymlink()) {
2058        throw new ParentNotDirectoryException("Parent path is not a directory: "
2059            + parent);
2060      }
2061    }
2062  }
2063  
2064  /**
2065   * Create a new file entry in the namespace.
2066   * 
2067   * For description of parameters and exceptions thrown see
   * {@link ClientProtocol#create()}, except that it returns a valid file
   * status upon success.
   * 
   * For retryCache handling details see
   * {@link #getFileStatus(boolean, CacheEntryWithPayload)}
2073   * 
2074   */
2075  HdfsFileStatus startFile(String src, PermissionStatus permissions,
2076      String holder, String clientMachine, EnumSet<CreateFlag> flag,
2077      boolean createParent, short replication, long blockSize)
2078      throws AccessControlException, SafeModeException,
2079      FileAlreadyExistsException, UnresolvedLinkException,
2080      FileNotFoundException, ParentNotDirectoryException, IOException {
2081    HdfsFileStatus status = null;
2082    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
2083        null);
2084    if (cacheEntry != null && cacheEntry.isSuccess()) {
2085      return (HdfsFileStatus) cacheEntry.getPayload();
2086    }
2087    
2088    try {
2089      status = startFileInt(src, permissions, holder, clientMachine, flag,
2090          createParent, replication, blockSize, cacheEntry != null);
2091    } catch (AccessControlException e) {
2092      logAuditEvent(false, "create", src);
2093      throw e;
2094    } finally {
2095      RetryCache.setState(cacheEntry, status != null, status);
2096    }
2097    return status;
2098  }
2099
2100  private HdfsFileStatus startFileInt(String src, PermissionStatus permissions,
2101      String holder, String clientMachine, EnumSet<CreateFlag> flag,
2102      boolean createParent, short replication, long blockSize,
2103      boolean logRetryCache) throws AccessControlException, SafeModeException,
2104      FileAlreadyExistsException, UnresolvedLinkException,
2105      FileNotFoundException, ParentNotDirectoryException, IOException {
2106    if (NameNode.stateChangeLog.isDebugEnabled()) {
2107      NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: src=" + src
2108          + ", holder=" + holder
2109          + ", clientMachine=" + clientMachine
2110          + ", createParent=" + createParent
2111          + ", replication=" + replication
2112          + ", createFlag=" + flag.toString());
2113    }
2114    if (!DFSUtil.isValidName(src)) {
2115      throw new InvalidPathException(src);
2116    }
2117    blockManager.verifyReplication(src, replication, clientMachine);
2118
2119    boolean skipSync = false;
2120    HdfsFileStatus stat = null;
2121    FSPermissionChecker pc = getPermissionChecker();
2122    checkOperation(OperationCategory.WRITE);
2123    if (blockSize < minBlockSize) {
2124      throw new IOException("Specified block size is less than configured" +
2125          " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY
2126          + "): " + blockSize + " < " + minBlockSize);
2127    }
2128    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2129    boolean create = flag.contains(CreateFlag.CREATE);
2130    boolean overwrite = flag.contains(CreateFlag.OVERWRITE);
2131    writeLock();
2132    try {
2133      checkOperation(OperationCategory.WRITE);
2134      checkNameNodeSafeMode("Cannot create file" + src);
2135      src = FSDirectory.resolvePath(src, pathComponents, dir);
2136      startFileInternal(pc, src, permissions, holder, clientMachine, create,
2137          overwrite, createParent, replication, blockSize, logRetryCache);
2138      stat = dir.getFileInfo(src, false);
2139    } catch (StandbyException se) {
2140      skipSync = true;
2141      throw se;
2142    } finally {
2143      writeUnlock();
2144      // There might be transactions logged while trying to recover the lease.
2145      // They need to be sync'ed even when an exception was thrown.
2146      if (!skipSync) {
2147        getEditLog().logSync();
2148      }
2149    } 
2150    logAuditEvent(true, "create", src, null, stat);
2151    return stat;
2152  }
2153
2154  /**
   * Create a new file or overwrite an existing file.<br>
   * 
   * Once the file is created, the client then allocates a new block with the
   * next call using {@link NameNode#addBlock()}.
2159   * <p>
2160   * For description of parameters and exceptions thrown see
2161   * {@link ClientProtocol#create()}
2162   */
2163  private void startFileInternal(FSPermissionChecker pc, String src,
2164      PermissionStatus permissions, String holder, String clientMachine,
2165      boolean create, boolean overwrite, boolean createParent,
2166      short replication, long blockSize, boolean logRetryEntry)
2167      throws FileAlreadyExistsException, AccessControlException,
2168      UnresolvedLinkException, FileNotFoundException,
2169      ParentNotDirectoryException, IOException {
2170    assert hasWriteLock();
2171    // Verify that the destination does not exist as a directory already.
2172    final INodesInPath iip = dir.getINodesInPath4Write(src);
2173    final INode inode = iip.getLastINode();
2174    if (inode != null && inode.isDirectory()) {
2175      throw new FileAlreadyExistsException("Cannot create file " + src
2176          + "; already exists as a directory.");
2177    }
2178    final INodeFile myFile = INodeFile.valueOf(inode, src, true);
2179    if (isPermissionEnabled) {
2180      if (overwrite && myFile != null) {
2181        checkPathAccess(pc, src, FsAction.WRITE);
2182      } else {
2183        checkAncestorAccess(pc, src, FsAction.WRITE);
2184      }
2185    }
2186
2187    if (!createParent) {
2188      verifyParentDir(src);
2189    }
2190
2191    try {
2192      if (myFile == null) {
2193        if (!create) {
2194          throw new FileNotFoundException("failed to overwrite non-existent file "
2195            + src + " on client " + clientMachine);
2196        }
2197      } else {
2198        if (overwrite) {
2199          try {
2200            deleteInt(src, true, false); // File exists - delete if overwrite
2201          } catch (AccessControlException e) {
2202            logAuditEvent(false, "delete", src);
2203            throw e;
2204          }
2205        } else {
2206          // If lease soft limit time is expired, recover the lease
2207          recoverLeaseInternal(myFile, src, holder, clientMachine, false);
2208          throw new FileAlreadyExistsException("failed to create file " + src
2209              + " on client " + clientMachine + " because the file exists");
2210        }
2211      }
2212
2213      checkFsObjectLimit();
2214      final DatanodeDescriptor clientNode = 
2215          blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
2216
2217      INodeFileUnderConstruction newNode = dir.addFile(src, permissions,
2218          replication, blockSize, holder, clientMachine, clientNode);
2219      if (newNode == null) {
2220        throw new IOException("DIR* NameSystem.startFile: " +
2221                              "Unable to add file to namespace.");
2222      }
2223      leaseManager.addLease(newNode.getClientName(), src);
2224
2225      // record file record in log, record new generation stamp
2226      getEditLog().logOpenFile(src, newNode, logRetryEntry);
2227      if (NameNode.stateChangeLog.isDebugEnabled()) {
2228        NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: "
2229                                   +"add "+src+" to namespace for "+holder);
2230      }
2231    } catch (IOException ie) {
2232      NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: "
2233                                   +ie.getMessage());
2234      throw ie;
2235    }
2236  }
2237  
2238  /**
   * Open an existing file for append.
2240   * <p>
2241   * 
2242   * The method returns the last block of the file if this is a partial block,
2243   * which can still be used for writing more data. The client uses the returned
2244   * block locations to form the data pipeline for this block.<br>
2245   * The method returns null if the last block is full. The client then
2246   * allocates a new block with the next call using {@link NameNode#addBlock()}.
2247   * <p>
2248   * 
2249   * For description of parameters and exceptions thrown see
2250   * {@link ClientProtocol#append(String, String)}
2251   * 
2252   * @return the last block locations if the block is partial or null otherwise
2253   */
2254  private LocatedBlock appendFileInternal(FSPermissionChecker pc, String src,
2255      String holder, String clientMachine, boolean logRetryCache)
2256      throws AccessControlException, UnresolvedLinkException,
2257      FileNotFoundException, IOException {
2258    assert hasWriteLock();
2259    // Verify that the destination does not exist as a directory already.
2260    final INodesInPath iip = dir.getINodesInPath4Write(src);
2261    final INode inode = iip.getLastINode();
2262    if (inode != null && inode.isDirectory()) {
2263      throw new FileAlreadyExistsException("Cannot append to directory " + src
2264          + "; already exists as a directory.");
2265    }
2266    if (isPermissionEnabled) {
2267      checkPathAccess(pc, src, FsAction.WRITE);
2268    }
2269
2270    try {
2271      if (inode == null) {
2272        throw new FileNotFoundException("failed to append to non-existent file "
2273          + src + " on client " + clientMachine);
2274      }
2275      INodeFile myFile = INodeFile.valueOf(inode, src, true);
2276      // Opening an existing file for write - may need to recover lease.
2277      recoverLeaseInternal(myFile, src, holder, clientMachine, false);
2278      
2279      // recoverLeaseInternal may create a new InodeFile via 
2280      // finalizeINodeFileUnderConstruction so we need to refresh 
2281      // the referenced file.  
2282      myFile = INodeFile.valueOf(dir.getINode(src), src, true);
2283      
2284      final DatanodeDescriptor clientNode = 
2285          blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
2286      return prepareFileForWrite(src, myFile, holder, clientMachine, clientNode,
2287          true, iip.getLatestSnapshot(), logRetryCache);
2288    } catch (IOException ie) {
2289      NameNode.stateChangeLog.warn("DIR* NameSystem.append: " +ie.getMessage());
2290      throw ie;
2291    }
2292  }
2293  
2294  /**
   * Replace the current inode with an INodeFileUnderConstruction.
2296   * Recreate in-memory lease record.
2297   * 
2298   * @param src path to the file
2299   * @param file existing file object
2300   * @param leaseHolder identifier of the lease holder on this file
2301   * @param clientMachine identifier of the client machine
2302   * @param clientNode if the client is collocated with a DN, that DN's descriptor
2303   * @param writeToEditLog whether to persist this change to the edit log
2304   * @param logRetryCache whether to record RPC ids in editlog for retry cache
2305   *                      rebuilding
2306   * @return the last block locations if the block is partial or null otherwise
2307   * @throws UnresolvedLinkException
2308   * @throws IOException
2309   */
2310  LocatedBlock prepareFileForWrite(String src, INodeFile file,
2311      String leaseHolder, String clientMachine, DatanodeDescriptor clientNode,
2312      boolean writeToEditLog, Snapshot latestSnapshot, boolean logRetryCache)
2313      throws IOException {
2314    file = file.recordModification(latestSnapshot, dir.getINodeMap());
2315    final INodeFileUnderConstruction cons = file.toUnderConstruction(
2316        leaseHolder, clientMachine, clientNode);
2317
2318    dir.replaceINodeFile(src, file, cons);
2319    leaseManager.addLease(cons.getClientName(), src);
2320    
2321    LocatedBlock ret = blockManager.convertLastBlockToUnderConstruction(cons);
2322    if (writeToEditLog) {
2323      getEditLog().logOpenFile(src, cons, logRetryCache);
2324    }
2325    return ret;
2326  }
2327
2328  /**
2329   * Recover lease;
2330   * Immediately revoke the lease of the current lease holder and start lease
2331   * recovery so that the file can be forced to be closed.
2332   * 
2333   * @param src the path of the file to start lease recovery
2334   * @param holder the lease holder's name
2335   * @param clientMachine the client machine's name
2336   * @return true if the file is already closed
2337   * @throws IOException
2338   */
2339  boolean recoverLease(String src, String holder, String clientMachine)
2340      throws IOException {
2341    if (!DFSUtil.isValidName(src)) {
2342      throw new IOException("Invalid file name: " + src);
2343    }
2344  
2345    boolean skipSync = false;
2346    FSPermissionChecker pc = getPermissionChecker();
2347    checkOperation(OperationCategory.WRITE);
2348    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2349    writeLock();
2350    try {
2351      checkOperation(OperationCategory.WRITE);
2352      checkNameNodeSafeMode("Cannot recover the lease of " + src);
2353      src = FSDirectory.resolvePath(src, pathComponents, dir);
2354      final INodeFile inode = INodeFile.valueOf(dir.getINode(src), src);
2355      if (!inode.isUnderConstruction()) {
2356        return true;
2357      }
2358      if (isPermissionEnabled) {
2359        checkPathAccess(pc, src, FsAction.WRITE);
2360      }
2361  
2362      recoverLeaseInternal(inode, src, holder, clientMachine, true);
2363    } catch (StandbyException se) {
2364      skipSync = true;
2365      throw se;
2366    } finally {
2367      writeUnlock();
2368      // There might be transactions logged while trying to recover the lease.
2369      // They need to be sync'ed even when an exception was thrown.
2370      if (!skipSync) {
2371        getEditLog().logSync();
2372      }
2373    }
2374    return false;
2375  }
2376
2377  private void recoverLeaseInternal(INodeFile fileInode, 
2378      String src, String holder, String clientMachine, boolean force)
2379      throws IOException {
2380    assert hasWriteLock();
2381    if (fileInode != null && fileInode.isUnderConstruction()) {
2382      INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction) fileInode;
2383      //
      // If the file is under construction, then it must be in our
2385      // leases. Find the appropriate lease record.
2386      //
2387      Lease lease = leaseManager.getLease(holder);
2388      //
2389      // We found the lease for this file. And surprisingly the original
2390      // holder is trying to recreate this file. This should never occur.
2391      //
2392      if (!force && lease != null) {
2393        Lease leaseFile = leaseManager.getLeaseByPath(src);
2394        if ((leaseFile != null && leaseFile.equals(lease)) ||
2395            lease.getHolder().equals(holder)) { 
2396          throw new AlreadyBeingCreatedException(
2397            "failed to create file " + src + " for " + holder +
2398            " on client " + clientMachine + 
2399            " because current leaseholder is trying to recreate file.");
2400        }
2401      }
2402      //
2403      // Find the original holder.
2404      //
2405      lease = leaseManager.getLease(pendingFile.getClientName());
2406      if (lease == null) {
2407        throw new AlreadyBeingCreatedException(
2408          "failed to create file " + src + " for " + holder +
2409          " on client " + clientMachine + 
2410          " because pendingCreates is non-null but no leases found.");
2411      }
2412      if (force) {
2413        // close now: no need to wait for soft lease expiration and 
2414        // close only the file src
2415        LOG.info("recoverLease: " + lease + ", src=" + src +
2416          " from client " + pendingFile.getClientName());
2417        internalReleaseLease(lease, src, holder);
2418      } else {
2419        assert lease.getHolder().equals(pendingFile.getClientName()) :
2420          "Current lease holder " + lease.getHolder() +
2421          " does not match file creator " + pendingFile.getClientName();
2422        //
2423        // If the original holder has not renewed in the last SOFTLIMIT 
2424        // period, then start lease recovery.
2425        //
2426        if (lease.expiredSoftLimit()) {
2427          LOG.info("startFile: recover " + lease + ", src=" + src + " client "
2428              + pendingFile.getClientName());
2429          boolean isClosed = internalReleaseLease(lease, src, null);
2430          if(!isClosed)
2431            throw new RecoveryInProgressException(
2432                "Failed to close file " + src +
2433                ". Lease recovery is in progress. Try again later.");
2434        } else {
2435          final BlockInfo lastBlock = pendingFile.getLastBlock();
2436          if (lastBlock != null
2437              && lastBlock.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
2438            throw new RecoveryInProgressException("Recovery in progress, file ["
2439                + src + "], " + "lease owner [" + lease.getHolder() + "]");
2440          } else {
2441            throw new AlreadyBeingCreatedException("Failed to create file ["
2442                + src + "] for [" + holder + "] on client [" + clientMachine
2443                + "], because this file is already being created by ["
2444                + pendingFile.getClientName() + "] on ["
2445                + pendingFile.getClientMachine() + "]");
2446          }
2447        }
2448      }
2449    }
2450  }
2451
2452  /**
2453   * Append to an existing file in the namespace.
2454   */
2455  LocatedBlock appendFile(String src, String holder, String clientMachine)
2456      throws AccessControlException, SafeModeException,
2457      FileAlreadyExistsException, FileNotFoundException,
2458      ParentNotDirectoryException, IOException {
2459    LocatedBlock lb = null;
2460    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
2461        null);
2462    if (cacheEntry != null && cacheEntry.isSuccess()) {
2463      return (LocatedBlock) cacheEntry.getPayload();
2464    }
2465      
2466    boolean success = false;
2467    try {
2468      lb = appendFileInt(src, holder, clientMachine, cacheEntry != null);
2469      success = true;
2470      return lb;
2471    } catch (AccessControlException e) {
2472      logAuditEvent(false, "append", src);
2473      throw e;
2474    } finally {
2475      RetryCache.setState(cacheEntry, success, lb);
2476    }
2477  }
2478
2479  private LocatedBlock appendFileInt(String src, String holder,
2480      String clientMachine, boolean logRetryCache)
2481      throws AccessControlException, SafeModeException,
2482      FileAlreadyExistsException, FileNotFoundException,
2483      ParentNotDirectoryException, IOException {
2484    if (NameNode.stateChangeLog.isDebugEnabled()) {
2485      NameNode.stateChangeLog.debug("DIR* NameSystem.appendFile: src=" + src
2486          + ", holder=" + holder
2487          + ", clientMachine=" + clientMachine);
2488    }
2489    boolean skipSync = false;
2490    if (!supportAppends) {
2491      throw new UnsupportedOperationException(
2492          "Append is not enabled on this NameNode. Use the " +
2493          DFS_SUPPORT_APPEND_KEY + " configuration option to enable it.");
2494    }
2495
2496    LocatedBlock lb = null;
2497    FSPermissionChecker pc = getPermissionChecker();
2498    checkOperation(OperationCategory.WRITE);
2499    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2500    writeLock();
2501    try {
2502      checkOperation(OperationCategory.WRITE);
2503      checkNameNodeSafeMode("Cannot append to file" + src);
2504      src = FSDirectory.resolvePath(src, pathComponents, dir);
2505      lb = appendFileInternal(pc, src, holder, clientMachine, logRetryCache);
2506    } catch (StandbyException se) {
2507      skipSync = true;
2508      throw se;
2509    } finally {
2510      writeUnlock();
2511      // There might be transactions logged while trying to recover the lease.
2512      // They need to be sync'ed even when an exception was thrown.
2513      if (!skipSync) {
2514        getEditLog().logSync();
2515      }
2516    }
2517    if (lb != null) {
2518      if (NameNode.stateChangeLog.isDebugEnabled()) {
2519        NameNode.stateChangeLog.debug("DIR* NameSystem.appendFile: file "
2520            +src+" for "+holder+" at "+clientMachine
2521            +" block " + lb.getBlock()
2522            +" block size " + lb.getBlock().getNumBytes());
2523      }
2524    }
2525    logAuditEvent(true, "append", src);
2526    return lb;
2527  }
2528
2529  ExtendedBlock getExtendedBlock(Block blk) {
2530    return new ExtendedBlock(blockPoolId, blk);
2531  }
2532  
2533  void setBlockPoolId(String bpid) {
2534    blockPoolId = bpid;
2535    blockManager.setBlockPoolId(blockPoolId);
2536  }
2537
2538  /**
2539   * The client would like to obtain an additional block for the indicated
   * filename (which is being written-to).  Return a located block that
   * consists of the block plus a set of machines.  The first on this list
   * should be where the client writes data.  Subsequent items in the list
   * must be provided in the connection to the first datanode.
   *
   * Make sure the previous blocks have been reported by datanodes and
   * are replicated.  If the request is a retry of an allocation that
   * already succeeded, the previously allocated block is returned instead.
2548   */
2549  LocatedBlock getAdditionalBlock(String src, long fileId, String clientName,
2550      ExtendedBlock previous, Set<Node> excludedNodes, 
2551      List<String> favoredNodes)
2552      throws LeaseExpiredException, NotReplicatedYetException,
2553      QuotaExceededException, SafeModeException, UnresolvedLinkException,
2554      IOException {
2555    long blockSize;
2556    int replication;
2557    DatanodeDescriptor clientNode = null;
2558
2559    if(NameNode.stateChangeLog.isDebugEnabled()) {
2560      NameNode.stateChangeLog.debug(
2561          "BLOCK* NameSystem.getAdditionalBlock: file "
2562          +src+" for "+clientName);
2563    }
2564
2565    // Part I. Analyze the state of the file with respect to the input data.
2566    checkOperation(OperationCategory.READ);
2567    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2568    readLock();
2569    try {
2570      checkOperation(OperationCategory.READ);
2571      src = FSDirectory.resolvePath(src, pathComponents, dir);
2572      LocatedBlock[] onRetryBlock = new LocatedBlock[1];
2573      final INode[] inodes = analyzeFileState(
2574          src, fileId, clientName, previous, onRetryBlock).getINodes();
2575      final INodeFileUnderConstruction pendingFile =
2576          (INodeFileUnderConstruction) inodes[inodes.length - 1].asFile();
2577
2578      if (onRetryBlock[0] != null && onRetryBlock[0].getLocations().length > 0) {
        // This is a retry. Just return the last block if it has locations.
2580        return onRetryBlock[0];
2581      }
2582      if (pendingFile.getBlocks().length >= maxBlocksPerFile) {
2583        throw new IOException("File has reached the limit on maximum number of"
2584            + " blocks (" + DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY
2585            + "): " + pendingFile.getBlocks().length + " >= "
2586            + maxBlocksPerFile);
2587      }
2588      blockSize = pendingFile.getPreferredBlockSize();
2589      clientNode = pendingFile.getClientNode();
2590      replication = pendingFile.getFileReplication();
2591    } finally {
2592      readUnlock();
2593    }
2594
2595    // choose targets for the new block to be allocated.
2596    final DatanodeStorageInfo targets[] = getBlockManager().chooseTarget( 
2597        src, replication, clientNode, excludedNodes, blockSize, favoredNodes);
2598
2599    // Part II.
2600    // Allocate a new block, add it to the INode and the BlocksMap. 
2601    Block newBlock = null;
2602    long offset;
2603    checkOperation(OperationCategory.WRITE);
2604    writeLock();
2605    try {
2606      checkOperation(OperationCategory.WRITE);
2607      // Run the full analysis again, since things could have changed
2608      // while chooseTarget() was executing.
2609      LocatedBlock[] onRetryBlock = new LocatedBlock[1];
2610      INodesInPath inodesInPath =
2611          analyzeFileState(src, fileId, clientName, previous, onRetryBlock);
2612      final INode[] inodes = inodesInPath.getINodes();
2613      final INodeFileUnderConstruction pendingFile =
2614          (INodeFileUnderConstruction) inodes[inodes.length - 1].asFile();
2615
2616      if (onRetryBlock[0] != null) {
2617        if (onRetryBlock[0].getLocations().length > 0) {
          // This is a retry. Just return the last block if it has locations.
2619          return onRetryBlock[0];
2620        } else {
2621          // add new chosen targets to already allocated block and return
2622          BlockInfo lastBlockInFile = pendingFile.getLastBlock();
2623          ((BlockInfoUnderConstruction) lastBlockInFile)
2624              .setExpectedLocations(targets);
2625          offset = pendingFile.computeFileSize();
2626          return makeLocatedBlock(lastBlockInFile, targets, offset);
2627        }
2628      }
2629
2630      // commit the last block and complete it if it has minimum replicas
2631      commitOrCompleteLastBlock(pendingFile,
2632                                ExtendedBlock.getLocalBlock(previous));
2633
2634      // allocate new block, record block locations in INode.
2635      newBlock = createNewBlock();
2636      saveAllocatedBlock(src, inodesInPath, newBlock, targets);
2637
2638      dir.persistNewBlock(src, pendingFile);
2639      offset = pendingFile.computeFileSize();
2640    } finally {
2641      writeUnlock();
2642    }
2643    getEditLog().logSync();
2644
2645    // Return located block
2646    return makeLocatedBlock(newBlock, targets, offset);
2647  }
2648
2649  INodesInPath analyzeFileState(String src,
2650                                long fileId,
2651                                String clientName,
2652                                ExtendedBlock previous,
2653                                LocatedBlock[] onRetryBlock)
2654          throws IOException  {
2655    assert hasReadLock();
2656
2657    checkBlock(previous);
2658    onRetryBlock[0] = null;
2659    checkOperation(OperationCategory.WRITE);
2660    checkNameNodeSafeMode("Cannot add block to " + src);
2661
    // Verify that we have not exceeded the configured limit of fs objects.
2663    checkFsObjectLimit();
2664
2665    Block previousBlock = ExtendedBlock.getLocalBlock(previous);
2666    final INodesInPath iip = dir.getINodesInPath4Write(src);
2667    final INodeFileUnderConstruction pendingFile
2668        = checkLease(src, fileId, clientName, iip.getLastINode());
2669    BlockInfo lastBlockInFile = pendingFile.getLastBlock();
2670    if (!Block.matchingIdAndGenStamp(previousBlock, lastBlockInFile)) {
2671      // The block that the client claims is the current last block
2672      // doesn't match up with what we think is the last block. There are
2673      // four possibilities:
2674      // 1) This is the first block allocation of an append() pipeline
2675      //    which started appending exactly at a block boundary.
2676      //    In this case, the client isn't passed the previous block,
2677      //    so it makes the allocateBlock() call with previous=null.
2678      //    We can distinguish this since the last block of the file
2679      //    will be exactly a full block.
2680      // 2) This is a retry from a client that missed the response of a
2681      //    prior getAdditionalBlock() call, perhaps because of a network
2682      //    timeout, or because of an HA failover. In that case, we know
2683      //    by the fact that the client is re-issuing the RPC that it
      //    never began to write to the old block. Hence it is safe
      //    to return the existing block.
2686      // 3) This is an entirely bogus request/bug -- we should error out
2687      //    rather than potentially appending a new block with an empty
2688      //    one in the middle, etc
2689      // 4) This is a retry from a client that timed out while
2690      //    the prior getAdditionalBlock() is still being processed,
2691      //    currently working on chooseTarget(). 
2692      //    There are no means to distinguish between the first and 
2693      //    the second attempts in Part I, because the first one hasn't
2694      //    changed the namesystem state yet.
2695      //    We run this analysis again in Part II where case 4 is impossible.
2696
2697      BlockInfo penultimateBlock = pendingFile.getPenultimateBlock();
2698      if (previous == null &&
2699          lastBlockInFile != null &&
2700          lastBlockInFile.getNumBytes() == pendingFile.getPreferredBlockSize() &&
2701          lastBlockInFile.isComplete()) {
2702        // Case 1
2703        if (NameNode.stateChangeLog.isDebugEnabled()) {
2704           NameNode.stateChangeLog.debug(
2705               "BLOCK* NameSystem.allocateBlock: handling block allocation" +
2706               " writing to a file with a complete previous block: src=" +
2707               src + " lastBlock=" + lastBlockInFile);
2708        }
2709      } else if (Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)) {
2710        if (lastBlockInFile.getNumBytes() != 0) {
2711          throw new IOException(
2712              "Request looked like a retry to allocate block " +
2713              lastBlockInFile + " but it already contains " +
2714              lastBlockInFile.getNumBytes() + " bytes");
2715        }
2716
2717        // Case 2
2718        // Return the last block.
2719        NameNode.stateChangeLog.info("BLOCK* allocateBlock: " +
2720            "caught retry for allocation of a new block in " +
2721            src + ". Returning previously allocated block " + lastBlockInFile);
2722        long offset = pendingFile.computeFileSize();
2723        onRetryBlock[0] = makeLocatedBlock(lastBlockInFile,
2724            ((BlockInfoUnderConstruction)lastBlockInFile).getExpectedStorageLocations(),
2725            offset);
2726        return iip;
2727      } else {
2728        // Case 3
2729        throw new IOException("Cannot allocate block in " + src + ": " +
2730            "passed 'previous' block " + previous + " does not match actual " +
2731            "last block in file " + lastBlockInFile);
2732      }
2733    }
2734
2735    // Check if the penultimate block is minimally replicated
2736    if (!checkFileProgress(pendingFile, false)) {
2737      throw new NotReplicatedYetException("Not replicated yet: " + src);
2738    }
2739    return iip;
2740  }
2741
2742  LocatedBlock makeLocatedBlock(Block blk, DatanodeStorageInfo[] locs,
2743                                        long offset) throws IOException {
2744    LocatedBlock lBlk = new LocatedBlock(
2745        getExtendedBlock(blk), locs, offset, false);
2746    getBlockManager().setBlockToken(
2747        lBlk, BlockTokenSecretManager.AccessMode.WRITE);
2748    return lBlk;
2749  }
2750
2751  /** @see NameNode#getAdditionalDatanode(String, ExtendedBlock, DatanodeInfo[], DatanodeInfo[], int, String) */
2752  LocatedBlock getAdditionalDatanode(String src, final ExtendedBlock blk,
2753      final DatanodeInfo[] existings, final String[] storageIDs,
2754      final Set<Node> excludes,
2755      final int numAdditionalNodes, final String clientName
2756      ) throws IOException {
2757    //check if the feature is enabled
2758    dtpReplaceDatanodeOnFailure.checkEnabled();
2759
2760    final DatanodeDescriptor clientnode;
2761    final long preferredblocksize;
2762    final List<DatanodeStorageInfo> chosen;
2763    checkOperation(OperationCategory.READ);
2764    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2765    readLock();
2766    try {
2767      checkOperation(OperationCategory.READ);
2768      //check safe mode
2769      checkNameNodeSafeMode("Cannot add datanode; src=" + src + ", blk=" + blk);
2770      src = FSDirectory.resolvePath(src, pathComponents, dir);
2771
2772      //check lease
2773      final INodeFileUnderConstruction file = checkLease(src, clientName);
2774      clientnode = file.getClientNode();
2775      preferredblocksize = file.getPreferredBlockSize();
2776
2777      //find datanode storages
2778      final DatanodeManager dm = blockManager.getDatanodeManager();
2779      chosen = Arrays.asList(dm.getDatanodeStorageInfos(existings, storageIDs));
2780    } finally {
2781      readUnlock();
2782    }
2783
2784    // choose new datanodes.
2785    final DatanodeStorageInfo[] targets = blockManager.getBlockPlacementPolicy(
2786        ).chooseTarget(src, numAdditionalNodes, clientnode, chosen, true,
2787            // TODO: get storage type from the file
2788        excludes, preferredblocksize, StorageType.DEFAULT);
2789    final LocatedBlock lb = new LocatedBlock(blk, targets);
2790    blockManager.setBlockToken(lb, AccessMode.COPY);
2791    return lb;
2792  }
2793
2794  /**
2795   * The client would like to let go of the given block
2796   */
2797  boolean abandonBlock(ExtendedBlock b, String src, String holder)
2798      throws LeaseExpiredException, FileNotFoundException,
2799      UnresolvedLinkException, IOException {
2800    if(NameNode.stateChangeLog.isDebugEnabled()) {
2801      NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: " + b
2802          + "of file " + src);
2803    }
2804    checkOperation(OperationCategory.WRITE);
2805    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2806    writeLock();
2807    try {
2808      checkOperation(OperationCategory.WRITE);
2809      checkNameNodeSafeMode("Cannot abandon block " + b + " for fle" + src);
2810      src = FSDirectory.resolvePath(src, pathComponents, dir);
2811
2812      //
2813      // Remove the block from the pending creates list
2814      //
2815      INodeFileUnderConstruction file = checkLease(src, holder);
2816      boolean removed = dir.removeBlock(src, file,
2817          ExtendedBlock.getLocalBlock(b));
2818      if (!removed) {
2819        return true;
2820      }
2821      if(NameNode.stateChangeLog.isDebugEnabled()) {
2822        NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: "
2823                                      + b + " is removed from pendingCreates");
2824      }
2825      dir.persistBlocks(src, file, false);
2826    } finally {
2827      writeUnlock();
2828    }
2829    getEditLog().logSync();
2830
2831    return true;
2832  }
2833  
  /** Make sure that we still have the lease on this file. */
2835  private INodeFileUnderConstruction checkLease(String src, String holder)
2836      throws LeaseExpiredException, UnresolvedLinkException,
2837      FileNotFoundException {
2838    return checkLease(src, INodeId.GRANDFATHER_INODE_ID, holder,
2839        dir.getINode(src));
2840  }
2841  
2842  private INodeFileUnderConstruction checkLease(String src, long fileId,
2843      String holder, INode inode) throws LeaseExpiredException,
2844      FileNotFoundException {
2845    assert hasReadLock();
2846    if (inode == null || !inode.isFile()) {
2847      Lease lease = leaseManager.getLease(holder);
2848      throw new LeaseExpiredException(
2849          "No lease on " + src + ": File does not exist. "
2850          + (lease != null ? lease.toString()
2851              : "Holder " + holder + " does not have any open files."));
2852    }
2853    final INodeFile file = inode.asFile();
2854    if (!file.isUnderConstruction()) {
2855      Lease lease = leaseManager.getLease(holder);
2856      throw new LeaseExpiredException(
2857          "No lease on " + src + ": File is not open for writing. "
2858          + (lease != null ? lease.toString()
2859              : "Holder " + holder + " does not have any open files."));
2860    }
2861    INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction)file;
2862    if (holder != null && !pendingFile.getClientName().equals(holder)) {
2863      throw new LeaseExpiredException("Lease mismatch on " + src + " owned by "
2864          + pendingFile.getClientName() + " but is accessed by " + holder);
2865    }
2866    INodeId.checkId(fileId, pendingFile);
2867    return pendingFile;
2868  }
2869 
2870  /**
2871   * Complete in-progress write to the given file.
2872   * @return true if successful, false if the client should continue to retry
   *         (e.g. if not all blocks have reached minimum replication yet)
   * @throws IOException on error (e.g. lease mismatch, file not open, file deleted)
2875   */
2876  boolean completeFile(String src, String holder,
2877                       ExtendedBlock last, long fileId)
2878    throws SafeModeException, UnresolvedLinkException, IOException {
2879    if (NameNode.stateChangeLog.isDebugEnabled()) {
2880      NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: " +
2881          src + " for " + holder);
2882    }
2883    checkBlock(last);
2884    boolean success = false;
2885    checkOperation(OperationCategory.WRITE);
2886    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2887    writeLock();
2888    try {
2889      checkOperation(OperationCategory.WRITE);
2890      checkNameNodeSafeMode("Cannot complete file " + src);
2891      src = FSDirectory.resolvePath(src, pathComponents, dir);
2892      success = completeFileInternal(src, holder,
2893        ExtendedBlock.getLocalBlock(last), fileId);
2894    } finally {
2895      writeUnlock();
2896    }
2897    getEditLog().logSync();
2898    if (success) {
2899      NameNode.stateChangeLog.info("DIR* completeFile: " + src
2900          + " is closed by " + holder);
2901    }
2902    return success;
2903  }
2904
2905  private boolean completeFileInternal(String src, 
2906      String holder, Block last, long fileId) throws SafeModeException,
2907      UnresolvedLinkException, IOException {
2908    assert hasWriteLock();
2909    final INodesInPath iip = dir.getLastINodeInPath(src);
2910    final INodeFileUnderConstruction pendingFile;
2911    try {
2912      pendingFile = checkLease(src, fileId, holder, iip.getINode(0));
2913    } catch (LeaseExpiredException lee) {
2914      final INode inode = dir.getINode(src);
2915      if (inode != null
2916          && inode.isFile()
2917          && !inode.asFile().isUnderConstruction()) {
2918        // This could be a retry RPC - i.e the client tried to close
2919        // the file, but missed the RPC response. Thus, it is trying
2920        // again to close the file. If the file still exists and
2921        // the client's view of the last block matches the actual
2922        // last block, then we'll treat it as a successful close.
2923        // See HDFS-3031.
2924        final Block realLastBlock = inode.asFile().getLastBlock();
2925        if (Block.matchingIdAndGenStamp(last, realLastBlock)) {
2926          NameNode.stateChangeLog.info("DIR* completeFile: " +
2927              "request from " + holder + " to complete " + src +
2928              " which is already closed. But, it appears to be an RPC " +
2929              "retry. Returning success");
2930          return true;
2931        }
2932      }
2933      throw lee;
2934    }
2935    // Check the state of the penultimate block. It should be completed
2936    // before attempting to complete the last one.
2937    if (!checkFileProgress(pendingFile, false)) {
2938      return false;
2939    }
2940
2941    // commit the last block and complete it if it has minimum replicas
2942    commitOrCompleteLastBlock(pendingFile, last);
2943
2944    if (!checkFileProgress(pendingFile, true)) {
2945      return false;
2946    }
2947
2948    finalizeINodeFileUnderConstruction(src, pendingFile,
2949        iip.getLatestSnapshot());
2950    return true;
2951  }
2952
2953  /**
2954   * Save allocated block at the given pending filename
2955   * 
2956   * @param src path to the file
2957   * @param inodesInPath representing each of the components of src. 
2958   *                     The last INode is the INode for the file.
2959   * @throws QuotaExceededException If addition of block exceeds space quota
2960   */
2961  BlockInfo saveAllocatedBlock(String src, INodesInPath inodes,
2962      Block newBlock, DatanodeStorageInfo[] targets)
2963          throws IOException {
2964    assert hasWriteLock();
2965    BlockInfo b = dir.addBlock(src, inodes, newBlock, targets);
2966    NameNode.stateChangeLog.info("BLOCK* allocateBlock: " + src + ". "
2967        + getBlockPoolId() + " " + b);
2968    DatanodeStorageInfo.incrementBlocksScheduled(targets);
2969    return b;
2970  }
2971
2972  /**
2973   * Create new block with a unique block id and a new generation stamp.
2974   */
2975  Block createNewBlock() throws IOException {
2976    assert hasWriteLock();
2977    Block b = new Block(nextBlockId(), 0, 0);
2978    // Increment the generation stamp for every new block.
2979    b.setGenerationStamp(nextGenerationStamp(false));
2980    return b;
2981  }
2982
2983  /**
   * Check that the indicated file's blocks are present and
   * minimally replicated.  If not, return false. If checkall is true, check
   * all blocks; otherwise check only the penultimate block.
2987   */
2988  boolean checkFileProgress(INodeFile v, boolean checkall) {
2989    readLock();
2990    try {
2991      if (checkall) {
2992        //
2993        // check all blocks of the file.
2994        //
2995        for (BlockInfo block: v.getBlocks()) {
2996          if (!block.isComplete()) {
2997            LOG.info("BLOCK* checkFileProgress: " + block
2998                + " has not reached minimal replication "
2999                + blockManager.minReplication);
3000            return false;
3001          }
3002        }
3003      } else {
3004        //
3005        // check the penultimate block of this file
3006        //
3007        BlockInfo b = v.getPenultimateBlock();
3008        if (b != null && !b.isComplete()) {
3009          LOG.warn("BLOCK* checkFileProgress: " + b
3010              + " has not reached minimal replication "
3011              + blockManager.minReplication);
3012          return false;
3013        }
3014      }
3015      return true;
3016    } finally {
3017      readUnlock();
3018    }
3019  }
3020
3021  ////////////////////////////////////////////////////////////////
3022  // Here's how to handle block-copy failure during client write:
3023  // -- As usual, the client's write should result in a streaming
3024  // backup write to a k-machine sequence.
3025  // -- If one of the backup machines fails, no worries.  Fail silently.
3026  // -- Before client is allowed to close and finalize file, make sure
3027  // that the blocks are backed up.  Namenode may have to issue specific backup
3028  // commands to make up for earlier datanode failures.  Once all copies
3029  // are made, edit namespace and return to client.
3030  ////////////////////////////////////////////////////////////////
3031
3032  /** 
3033   * Change the indicated filename. 
3034   * @deprecated Use {@link #renameTo(String, String, Options.Rename...)} instead.
3035   */
3036  @Deprecated
3037  boolean renameTo(String src, String dst) 
3038      throws IOException, UnresolvedLinkException {
3039    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
3040    if (cacheEntry != null && cacheEntry.isSuccess()) {
3041      return true; // Return previous response
3042    }
3043    boolean ret = false;
3044    try {
3045      ret = renameToInt(src, dst, cacheEntry != null);
3046    } catch (AccessControlException e) {
3047      logAuditEvent(false, "rename", src, dst, null);
3048      throw e;
3049    } finally {
3050      RetryCache.setState(cacheEntry, ret);
3051    }
3052    return ret;
3053  }
3054
3055  private boolean renameToInt(String src, String dst, boolean logRetryCache) 
3056    throws IOException, UnresolvedLinkException {
3057    if (NameNode.stateChangeLog.isDebugEnabled()) {
3058      NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: " + src +
3059          " to " + dst);
3060    }
3061    if (!DFSUtil.isValidName(dst)) {
3062      throw new IOException("Invalid name: " + dst);
3063    }
3064    FSPermissionChecker pc = getPermissionChecker();
3065    checkOperation(OperationCategory.WRITE);
3066    byte[][] srcComponents = FSDirectory.getPathComponentsForReservedPath(src);
3067    byte[][] dstComponents = FSDirectory.getPathComponentsForReservedPath(dst);
3068    boolean status = false;
3069    HdfsFileStatus resultingStat = null;
3070    writeLock();
3071    try {
3072      checkOperation(OperationCategory.WRITE);
3073      checkNameNodeSafeMode("Cannot rename " + src);
3074      src = FSDirectory.resolvePath(src, srcComponents, dir);
3075      dst = FSDirectory.resolvePath(dst, dstComponents, dir);
3076      checkOperation(OperationCategory.WRITE);
3077      status = renameToInternal(pc, src, dst, logRetryCache);
3078      if (status) {
3079        resultingStat = getAuditFileInfo(dst, false);
3080      }
3081    } finally {
3082      writeUnlock();
3083    }
3084    getEditLog().logSync();
3085    if (status) {
3086      logAuditEvent(true, "rename", src, dst, resultingStat);
3087    }
3088    return status;
3089  }
3090
3091  /** @deprecated See {@link #renameTo(String, String)} */
3092  @Deprecated
3093  private boolean renameToInternal(FSPermissionChecker pc, String src,
3094      String dst, boolean logRetryCache) throws IOException,
3095      UnresolvedLinkException {
3096    assert hasWriteLock();
3097    if (isPermissionEnabled) {
      //We should not be doing this.  This is move() not renameTo(),
      //but for now,
      //NOTE: yes, this is bad!  it assumes much lower-level behavior
      //      of rewriting the dst
      String actualdst = dir.isDir(dst) ?
          dst + Path.SEPARATOR + new Path(src).getName() : dst;
      // Rename does not operate on link targets
3105      // Do not resolveLink when checking permissions of src and dst
3106      // Check write access to parent of src
3107      checkPermission(pc, src, false, null, FsAction.WRITE, null, null, false);
3108      // Check write access to ancestor of dst
3109      checkPermission(pc, actualdst, false, FsAction.WRITE, null, null, null,
3110          false);
3111    }
3112
    return dir.renameTo(src, dst, logRetryCache);
3117  }
3118  
3119
3120  /** Rename src to dst */
3121  void renameTo(String src, String dst, Options.Rename... options)
3122      throws IOException, UnresolvedLinkException {
3123    if (NameNode.stateChangeLog.isDebugEnabled()) {
3124      NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: with options - "
3125          + src + " to " + dst);
3126    }
3127    if (!DFSUtil.isValidName(dst)) {
3128      throw new InvalidPathException("Invalid name: " + dst);
3129    }
3130    final FSPermissionChecker pc = getPermissionChecker();
3131    
3132    checkOperation(OperationCategory.WRITE);
3133    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
3134    if (cacheEntry != null && cacheEntry.isSuccess()) {
3135      return; // Return previous response
3136    }
3137    byte[][] srcComponents = FSDirectory.getPathComponentsForReservedPath(src);
3138    byte[][] dstComponents = FSDirectory.getPathComponentsForReservedPath(dst);
3139    HdfsFileStatus resultingStat = null;
3140    boolean success = false;
3141    writeLock();
3142    try {
3143      checkOperation(OperationCategory.WRITE);
3144      checkNameNodeSafeMode("Cannot rename " + src);
3145      src = FSDirectory.resolvePath(src, srcComponents, dir);
3146      dst = FSDirectory.resolvePath(dst, dstComponents, dir);
3147      renameToInternal(pc, src, dst, cacheEntry != null, options);
3148      resultingStat = getAuditFileInfo(dst, false);
3149      success = true;
3150    } finally {
3151      writeUnlock();
3152      RetryCache.setState(cacheEntry, success);
3153    }
3154    getEditLog().logSync();
3155    if (resultingStat != null) {
3156      StringBuilder cmd = new StringBuilder("rename options=");
3157      for (Rename option : options) {
3158        cmd.append(option.value()).append(" ");
3159      }
3160      logAuditEvent(true, cmd.toString(), src, dst, resultingStat);
3161    }
3162  }
3163
3164  private void renameToInternal(FSPermissionChecker pc, String src, String dst,
3165      boolean logRetryCache, Options.Rename... options) throws IOException {
3166    assert hasWriteLock();
3167    if (isPermissionEnabled) {
      // Rename does not operate on link targets
3169      // Do not resolveLink when checking permissions of src and dst
3170      // Check write access to parent of src
3171      checkPermission(pc, src, false, null, FsAction.WRITE, null, null, false);
3172      // Check write access to ancestor of dst
3173      checkPermission(pc, dst, false, FsAction.WRITE, null, null, null, false);
3174    }
3175
3176    dir.renameTo(src, dst, logRetryCache, options);
3177  }
3178  
3179  /**
3180   * Remove the indicated file from namespace.
3181   * 
   * @see ClientProtocol#delete(String, boolean) for a detailed description
   * and the exceptions thrown
3184   */
3185  boolean delete(String src, boolean recursive)
3186      throws AccessControlException, SafeModeException,
3187      UnresolvedLinkException, IOException {
3188    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
3189    if (cacheEntry != null && cacheEntry.isSuccess()) {
3190      return true; // Return previous response
3191    }
3192    boolean ret = false;
3193    try {
3194      ret = deleteInt(src, recursive, cacheEntry != null);
3195    } catch (AccessControlException e) {
3196      logAuditEvent(false, "delete", src);
3197      throw e;
3198    } finally {
3199      RetryCache.setState(cacheEntry, ret);
3200    }
3201    return ret;
3202  }
3203      
3204  private boolean deleteInt(String src, boolean recursive, boolean logRetryCache)
3205      throws AccessControlException, SafeModeException,
3206      UnresolvedLinkException, IOException {
3207    if (NameNode.stateChangeLog.isDebugEnabled()) {
3208      NameNode.stateChangeLog.debug("DIR* NameSystem.delete: " + src);
3209    }
3210    boolean status = deleteInternal(src, recursive, true, logRetryCache);
3211    if (status) {
3212      logAuditEvent(true, "delete", src);
3213    }
3214    return status;
3215  }
3216    
3217  private FSPermissionChecker getPermissionChecker()
3218      throws AccessControlException {
3219    try {
3220      return new FSPermissionChecker(fsOwnerShortUserName, supergroup, getRemoteUser());
3221    } catch (IOException ioe) {
3222      throw new AccessControlException(ioe);
3223    }
3224  }
3225  
3226  /**
3227   * Remove a file/directory from the namespace.
3228   * <p>
3229   * For large directories, deletion is incremental. The blocks under
   * the directory are collected and deleted a small number at a time while
   * holding the {@link FSNamesystem} lock.
   * <p>
   * For a small directory or file, the deletion is done in one shot.
3234   * 
3235   * @see ClientProtocol#delete(String, boolean) for description of exceptions
3236   */
3237  private boolean deleteInternal(String src, boolean recursive,
3238      boolean enforcePermission, boolean logRetryCache)
3239      throws AccessControlException, SafeModeException, UnresolvedLinkException,
3240             IOException {
3241    BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
3242    List<INode> removedINodes = new ChunkedArrayList<INode>();
3243    FSPermissionChecker pc = getPermissionChecker();
3244    checkOperation(OperationCategory.WRITE);
3245    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3246    boolean ret = false;
3247    writeLock();
3248    try {
3249      checkOperation(OperationCategory.WRITE);
3250      checkNameNodeSafeMode("Cannot delete " + src);
3251      src = FSDirectory.resolvePath(src, pathComponents, dir);
3252      if (!recursive && dir.isNonEmptyDirectory(src)) {
        throw new IOException(src + " is non-empty");
3254      }
3255      if (enforcePermission && isPermissionEnabled) {
3256        checkPermission(pc, src, false, null, FsAction.WRITE, null,
3257            FsAction.ALL, false);
3258      }
3259      // Unlink the target directory from directory tree
3260      if (!dir.delete(src, collectedBlocks, removedINodes, logRetryCache)) {
3261        return false;
3262      }
3263      ret = true;
3264    } finally {
3265      writeUnlock();
3266    }
3267    getEditLog().logSync(); 
3268    removeBlocks(collectedBlocks); // Incremental deletion of blocks
3269    collectedBlocks.clear();
3270    dir.writeLock();
3271    try {
3272      dir.removeFromInodeMap(removedINodes);
3273    } finally {
3274      dir.writeUnlock();
3275    }
3276    removedINodes.clear();
3277    if (NameNode.stateChangeLog.isDebugEnabled()) {
3278      NameNode.stateChangeLog.debug("DIR* Namesystem.delete: "
3279        + src +" is removed");
3280    }
3281    return ret;
3282  }
3283
3284  /**
   * From the given list, incrementally remove the blocks from blockManager.
   * The write lock is dropped and reacquired every BLOCK_DELETION_INCREMENT
   * blocks to ensure that other waiters on the lock can get in. See HDFS-2938.
3288   * 
3289   * @param blocks
3290   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3291   *          of blocks that need to be removed from blocksMap
3292   */
3293  void removeBlocks(BlocksMapUpdateInfo blocks) {
3294    List<Block> toDeleteList = blocks.getToDeleteList();
3295    Iterator<Block> iter = toDeleteList.iterator();
3296    while (iter.hasNext()) {
3297      writeLock();
3298      try {
3299        for (int i = 0; i < BLOCK_DELETION_INCREMENT && iter.hasNext(); i++) {
3300          blockManager.removeBlock(iter.next());
3301        }
3302      } finally {
3303        writeUnlock();
3304      }
3305    }
3306  }
3307  
3308  /**
3309   * Remove leases, inodes and blocks related to a given path
3310   * @param src The given path
3311   * @param blocks Containing the list of blocks to be deleted from blocksMap
3312   * @param removedINodes Containing the list of inodes to be removed from 
3313   *                      inodesMap
3314   */
3315  void removePathAndBlocks(String src, BlocksMapUpdateInfo blocks,
3316      List<INode> removedINodes) {
3317    assert hasWriteLock();
3318    leaseManager.removeLeaseWithPrefixPath(src);
3319    // remove inodes from inodesMap
3320    if (removedINodes != null) {
3321      dir.removeFromInodeMap(removedINodes);
3322      removedINodes.clear();
3323    }
3324    if (blocks == null) {
3325      return;
3326    }
3327    
3328    removeBlocksAndUpdateSafemodeTotal(blocks);
3329  }
3330
3331  /**
   * Removes the blocks from the blocksMap and updates the safe-mode block total.
3333   * 
3334   * @param blocks
3335   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3336   *          of blocks that need to be removed from blocksMap
3337   */
3338  void removeBlocksAndUpdateSafemodeTotal(BlocksMapUpdateInfo blocks) {
3339    assert hasWriteLock();
3340    // In the case that we are a Standby tailing edits from the
3341    // active while in safe-mode, we need to track the total number
3342    // of blocks and safe blocks in the system.
3343    boolean trackBlockCounts = isSafeModeTrackingBlocks();
3344    int numRemovedComplete = 0, numRemovedSafe = 0;
3345
3346    for (Block b : blocks.getToDeleteList()) {
3347      if (trackBlockCounts) {
3348        BlockInfo bi = getStoredBlock(b);
3349        if (bi.isComplete()) {
3350          numRemovedComplete++;
3351          if (bi.numNodes() >= blockManager.minReplication) {
3352            numRemovedSafe++;
3353          }
3354        }
3355      }
3356      blockManager.removeBlock(b);
3357    }
3358    if (trackBlockCounts) {
3359      if (LOG.isDebugEnabled()) {
3360        LOG.debug("Adjusting safe-mode totals for deletion."
3361            + "decreasing safeBlocks by " + numRemovedSafe
3362            + ", totalBlocks by " + numRemovedComplete);
3363      }
3364      adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete);
3365    }
3366  }
3367
3368  /**
3369   * @see SafeModeInfo#shouldIncrementallyTrackBlocks
3370   */
3371  private boolean isSafeModeTrackingBlocks() {
3372    if (!haEnabled) {
3373      // Never track blocks incrementally in non-HA code.
3374      return false;
3375    }
3376    SafeModeInfo sm = this.safeMode;
3377    return sm != null && sm.shouldIncrementallyTrackBlocks();
3378  }
3379
3380  /**
3381   * Get the file info for a specific file.
3382   *
3383   * @param src The string representation of the path to the file
3384   * @param resolveLink whether to throw UnresolvedLinkException 
3385   *        if src refers to a symlink
3386   *
3387   * @throws AccessControlException if access is denied
3388   * @throws UnresolvedLinkException if a symlink is encountered.
3389   *
3390   * @return object containing information regarding the file
3391   *         or null if file not found
3392   * @throws StandbyException 
3393   */
3394  HdfsFileStatus getFileInfo(String src, boolean resolveLink) 
3395    throws AccessControlException, UnresolvedLinkException,
3396           StandbyException, IOException {
3397    if (!DFSUtil.isValidName(src)) {
3398      throw new InvalidPathException("Invalid file name: " + src);
3399    }
3400    HdfsFileStatus stat = null;
3401    FSPermissionChecker pc = getPermissionChecker();
3402    checkOperation(OperationCategory.READ);
3406    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3407    readLock();
3408    try {
3409      checkOperation(OperationCategory.READ);
3410      src = FSDirectory.resolvePath(src, pathComponents, dir);
3411      if (isPermissionEnabled) {
3412        checkPermission(pc, src, false, null, null, null, null, resolveLink);
3413      }
3414      stat = dir.getFileInfo(src, resolveLink);
3415    } catch (AccessControlException e) {
3416      logAuditEvent(false, "getfileinfo", src);
3417      throw e;
3418    } finally {
3419      readUnlock();
3420    }
3421    logAuditEvent(true, "getfileinfo", src);
3422    return stat;
3423  }
3424  
3425  /**
3426   * Returns true if the file is closed
3427   */
3428  boolean isFileClosed(String src) 
3429      throws AccessControlException, UnresolvedLinkException,
3430      StandbyException, IOException {
3431    FSPermissionChecker pc = getPermissionChecker();  
3432    checkOperation(OperationCategory.READ);
3433    readLock();
3434    try {
3435      checkOperation(OperationCategory.READ);
3436      if (isPermissionEnabled) {
3437        checkTraverse(pc, src);
3438      }
3439      return !INodeFile.valueOf(dir.getINode(src), src).isUnderConstruction();
3440    } catch (AccessControlException e) {
3441      if (isAuditEnabled() && isExternalInvocation()) {
3442        logAuditEvent(false, "isFileClosed", src);
3443      }
3444      throw e;
3445    } finally {
3446      readUnlock();
3447    }
3448  }
3449
3450  /**
3451   * Create all the necessary directories
3452   */
3453  boolean mkdirs(String src, PermissionStatus permissions,
3454      boolean createParent) throws IOException, UnresolvedLinkException {
3455    boolean ret = false;
3456    try {
3457      ret = mkdirsInt(src, permissions, createParent);
3458    } catch (AccessControlException e) {
3459      logAuditEvent(false, "mkdirs", src);
3460      throw e;
3461    }
3462    return ret;
3463  }
3464
3465  private boolean mkdirsInt(String src, PermissionStatus permissions,
3466      boolean createParent) throws IOException, UnresolvedLinkException {
3467    if(NameNode.stateChangeLog.isDebugEnabled()) {
3468      NameNode.stateChangeLog.debug("DIR* NameSystem.mkdirs: " + src);
3469    }
3470    if (!DFSUtil.isValidName(src)) {
3471      throw new InvalidPathException(src);
3472    }
3473    FSPermissionChecker pc = getPermissionChecker();
3474    checkOperation(OperationCategory.WRITE);
3475    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3476    HdfsFileStatus resultingStat = null;
3477    boolean status = false;
3478    writeLock();
3479    try {
3480      checkOperation(OperationCategory.WRITE);   
3481      checkNameNodeSafeMode("Cannot create directory " + src);
3482      src = FSDirectory.resolvePath(src, pathComponents, dir);
3483      status = mkdirsInternal(pc, src, permissions, createParent);
3484      if (status) {
3485        resultingStat = dir.getFileInfo(src, false);
3486      }
3487    } finally {
3488      writeUnlock();
3489    }
3490    getEditLog().logSync();
3491    if (status) {
3492      logAuditEvent(true, "mkdirs", src, null, resultingStat);
3493    }
3494    return status;
3495  }
3496    
3497  /**
3498   * Create all the necessary directories
3499   */
3500  private boolean mkdirsInternal(FSPermissionChecker pc, String src,
3501      PermissionStatus permissions, boolean createParent) 
3502      throws IOException, UnresolvedLinkException {
3503    assert hasWriteLock();
3504    if (isPermissionEnabled) {
3505      checkTraverse(pc, src);
3506    }
3507    if (dir.isDirMutable(src)) {
      // All existing callers of mkdirs() expect 'true' even if
      // a new directory is not created.
3510      return true;
3511    }
3512    if (isPermissionEnabled) {
3513      checkAncestorAccess(pc, src, FsAction.WRITE);
3514    }
3515    if (!createParent) {
3516      verifyParentDir(src);
3517    }
3518
3519    // validate that we have enough inodes. This is, at best, a 
3520    // heuristic because the mkdirs() operation might need to 
3521    // create multiple inodes.
3522    checkFsObjectLimit();
3523
3524    if (!dir.mkdirs(src, permissions, false, now())) {
3525      throw new IOException("Failed to create directory: " + src);
3526    }
3527    return true;
3528  }
3529
3530  /**
3531   * Get the content summary for a specific file/dir.
3532   *
3533   * @param src The string representation of the path to the file
3534   *
3535   * @throws AccessControlException if access is denied
3536   * @throws UnresolvedLinkException if a symlink is encountered.
3537   * @throws FileNotFoundException if no file exists
3538   * @throws StandbyException
3539   * @throws IOException for issues with writing to the audit log
3540   *
   * @return object summarizing the content of the file or directory
3543   */
3544  ContentSummary getContentSummary(String src) throws IOException {
3545    FSPermissionChecker pc = getPermissionChecker();
3546    checkOperation(OperationCategory.READ);
3547    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3548    readLock();
3549    boolean success = true;
3550    try {
3551      checkOperation(OperationCategory.READ);
3552      src = FSDirectory.resolvePath(src, pathComponents, dir);
3553      if (isPermissionEnabled) {
3554        checkPermission(pc, src, false, null, null, null, FsAction.READ_EXECUTE);
3555      }
3556      return dir.getContentSummary(src);
3557
3558    } catch (AccessControlException ace) {
3559      success = false;
3560      throw ace;
3561    } finally {
3562      readUnlock();
3563      logAuditEvent(success, "contentSummary", src);
3564    }
3565  }
3566
3567  /**
3568   * Set the namespace quota and diskspace quota for a directory.
3569   * See {@link ClientProtocol#setQuota(String, long, long)} for the 
3570   * contract.
3571   * 
   * Note: This does not support ".inodes" relative paths.
3573   */
3574  void setQuota(String path, long nsQuota, long dsQuota) 
3575      throws IOException, UnresolvedLinkException {
3576    checkSuperuserPrivilege();
3577    checkOperation(OperationCategory.WRITE);
3578    writeLock();
3579    try {
3580      checkOperation(OperationCategory.WRITE);
3581      checkNameNodeSafeMode("Cannot set quota on " + path);
3582      dir.setQuota(path, nsQuota, dsQuota);
3583    } finally {
3584      writeUnlock();
3585    }
3586    getEditLog().logSync();
3587  }
3588
3589  /** Persist all metadata about this file.
3590   * @param src The string representation of the path
3591   * @param clientName The string representation of the client
3592   * @param lastBlockLength The length of the last block 
3593   *                        under construction reported from client.
3594   * @throws IOException if path does not exist
3595   */
3596  void fsync(String src, String clientName, long lastBlockLength) 
3597      throws IOException, UnresolvedLinkException {
3598    NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName);
3599    checkOperation(OperationCategory.WRITE);
3600    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3601    writeLock();
3602    try {
3603      checkOperation(OperationCategory.WRITE);
3604      checkNameNodeSafeMode("Cannot fsync file " + src);
3605      src = FSDirectory.resolvePath(src, pathComponents, dir);
3606      INodeFileUnderConstruction pendingFile  = checkLease(src, clientName);
3607      if (lastBlockLength > 0) {
3608        pendingFile.updateLengthOfLastBlock(lastBlockLength);
3609      }
3610      dir.persistBlocks(src, pendingFile, false);
3611    } finally {
3612      writeUnlock();
3613    }
3614    getEditLog().logSync();
3615  }
3616
3617  /**
   * Convert a file that is being written into an immutable, finalized file.
3619   * @param src The filename
3620   * @param lease The lease for the client creating the file
3621   * @param recoveryLeaseHolder reassign lease to this holder if the last block
3622   *        needs recovery; keep current holder if null.
3623   * @throws AlreadyBeingCreatedException if file is waiting to achieve minimal
3624   *         replication;<br>
3625   *         RecoveryInProgressException if lease recovery is in progress.<br>
3626   *         IOException in case of an error.
3627   * @return true  if file has been successfully finalized and closed or 
3628   *         false if block recovery has been initiated. Since the lease owner
3629   *         has been changed and logged, caller should call logSync().
3630   */
3631  boolean internalReleaseLease(Lease lease, String src, 
3632      String recoveryLeaseHolder) throws AlreadyBeingCreatedException, 
3633      IOException, UnresolvedLinkException {
3634    LOG.info("Recovering " + lease + ", src=" + src);
3635    assert !isInSafeMode();
3636    assert hasWriteLock();
3637
3638    final INodesInPath iip = dir.getLastINodeInPath(src);
3639    final INodeFileUnderConstruction pendingFile
3640        = INodeFileUnderConstruction.valueOf(iip.getINode(0), src);
3641    int nrBlocks = pendingFile.numBlocks();
3642    BlockInfo[] blocks = pendingFile.getBlocks();
3643
3644    int nrCompleteBlocks;
3645    BlockInfo curBlock = null;
3646    for(nrCompleteBlocks = 0; nrCompleteBlocks < nrBlocks; nrCompleteBlocks++) {
3647      curBlock = blocks[nrCompleteBlocks];
3648      if(!curBlock.isComplete())
3649        break;
3650      assert blockManager.checkMinReplication(curBlock) :
3651              "A COMPLETE block is not minimally replicated in " + src;
3652    }
3653
3654    // If there are no incomplete blocks associated with this file,
    // then reap the lease immediately and close the file.
3656    if(nrCompleteBlocks == nrBlocks) {
3657      finalizeINodeFileUnderConstruction(src, pendingFile,
3658          iip.getLatestSnapshot());
3659      NameNode.stateChangeLog.warn("BLOCK*"
3660        + " internalReleaseLease: All existing blocks are COMPLETE,"
3661        + " lease removed, file closed.");
3662      return true;  // closed!
3663    }
3664
3665    // Only the last and the penultimate blocks may be in non COMPLETE state.
3666    // If the penultimate block is not COMPLETE, then it must be COMMITTED.
    if (nrCompleteBlocks < nrBlocks - 2 ||
        (nrCompleteBlocks == nrBlocks - 2 &&
         curBlock != null &&
         curBlock.getBlockUCState() != BlockUCState.COMMITTED)) {
3671      final String message = "DIR* NameSystem.internalReleaseLease: "
3672        + "attempt to release a create lock on "
3673        + src + " but file is already closed.";
3674      NameNode.stateChangeLog.warn(message);
3675      throw new IOException(message);
3676    }
3677
    // At this point the last block is not COMPLETE, and
    // the penultimate block, if it exists, is either COMPLETE or COMMITTED.
3680    final BlockInfo lastBlock = pendingFile.getLastBlock();
3681    BlockUCState lastBlockState = lastBlock.getBlockUCState();
3682    BlockInfo penultimateBlock = pendingFile.getPenultimateBlock();
3683    boolean penultimateBlockMinReplication;
3684    BlockUCState penultimateBlockState;
3685    if (penultimateBlock == null) {
3686      penultimateBlockState = BlockUCState.COMPLETE;
      // If the penultimate block doesn't exist, its minimum replication is met
3688      penultimateBlockMinReplication = true;
3689    } else {
3690      penultimateBlockState = BlockUCState.COMMITTED;
3691      penultimateBlockMinReplication = 
3692        blockManager.checkMinReplication(penultimateBlock);
3693    }
3694    assert penultimateBlockState == BlockUCState.COMPLETE ||
3695           penultimateBlockState == BlockUCState.COMMITTED :
3696           "Unexpected state of penultimate block in " + src;
3697
3698    switch(lastBlockState) {
3699    case COMPLETE:
3700      assert false : "Already checked that the last block is incomplete";
3701      break;
3702    case COMMITTED:
3703      // Close file if committed blocks are minimally replicated
3704      if(penultimateBlockMinReplication &&
3705          blockManager.checkMinReplication(lastBlock)) {
3706        finalizeINodeFileUnderConstruction(src, pendingFile,
3707            iip.getLatestSnapshot());
3708        NameNode.stateChangeLog.warn("BLOCK*"
3709          + " internalReleaseLease: Committed blocks are minimally replicated,"
3710          + " lease removed, file closed.");
3711        return true;  // closed!
3712      }
3713      // Cannot close file right now, since some blocks 
3714      // are not yet minimally replicated.
3715      // This may potentially cause infinite loop in lease recovery
3716      // if there are no valid replicas on data-nodes.
3717      String message = "DIR* NameSystem.internalReleaseLease: " +
3718          "Failed to release lease for file " + src +
3719          ". Committed blocks are waiting to be minimally replicated." +
3720          " Try again later.";
3721      NameNode.stateChangeLog.warn(message);
3722      throw new AlreadyBeingCreatedException(message);
3723    case UNDER_CONSTRUCTION:
3724    case UNDER_RECOVERY:
3725      final BlockInfoUnderConstruction uc = (BlockInfoUnderConstruction)lastBlock;
3726      // setup the last block locations from the blockManager if not known
3727      if (uc.getNumExpectedLocations() == 0) {
3728        uc.setExpectedLocations(blockManager.getStorages(lastBlock));
3729      }
3730
3731      if (uc.getNumExpectedLocations() == 0 && uc.getNumBytes() == 0) {
        // No datanode has reported a replica for this block; the client may
        // have crashed before writing any data to the pipeline.
        // This block doesn't need any recovery.
        // We can remove this block and close the file.
3736        pendingFile.removeLastBlock(lastBlock);
3737        finalizeINodeFileUnderConstruction(src, pendingFile,
3738            iip.getLatestSnapshot());
3739        NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: "
3740            + "Removed empty last block and closed file.");
3741        return true;
3742      }
3743      // start recovery of the last block for this file
3744      long blockRecoveryId = nextGenerationStamp(isLegacyBlock(uc));
3745      lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile);
3746      uc.initializeBlockRecovery(blockRecoveryId);
3747      leaseManager.renewLease(lease);
3748      // Cannot close file right now, since the last block requires recovery.
3749      // This may potentially cause infinite loop in lease recovery
3750      // if there are no valid replicas on data-nodes.
3751      NameNode.stateChangeLog.warn(
3752                "DIR* NameSystem.internalReleaseLease: " +
3753                "File " + src + " has not been closed." +
3754               " Lease recovery is in progress. " +
3755                "RecoveryId = " + blockRecoveryId + " for block " + lastBlock);
3756      break;
3757    }
3758    return false;
3759  }
3760
3761  private Lease reassignLease(Lease lease, String src, String newHolder,
3762      INodeFileUnderConstruction pendingFile) {
3763    assert hasWriteLock();
3764    if(newHolder == null)
3765      return lease;
3766    // The following transaction is not synced. Make sure it's sync'ed later.
3767    logReassignLease(lease.getHolder(), src, newHolder);
3768    return reassignLeaseInternal(lease, src, newHolder, pendingFile);
3769  }
3770  
3771  Lease reassignLeaseInternal(Lease lease, String src, String newHolder,
3772      INodeFileUnderConstruction pendingFile) {
3773    assert hasWriteLock();
3774    pendingFile.setClientName(newHolder);
3775    return leaseManager.reassignLease(lease, src, newHolder);
3776  }
3777
3778  private void commitOrCompleteLastBlock(final INodeFileUnderConstruction fileINode,
3779      final Block commitBlock) throws IOException {
3780    assert hasWriteLock();
3781    if (!blockManager.commitOrCompleteLastBlock(fileINode, commitBlock)) {
3782      return;
3783    }
3784
3785    // Adjust disk space consumption if required
3786    final long diff = fileINode.getPreferredBlockSize() - commitBlock.getNumBytes();    
3787    if (diff > 0) {
3788      try {
3789        String path = leaseManager.findPath(fileINode);
        dir.updateSpaceConsumed(path, 0, -diff * fileINode.getFileReplication());
3791      } catch (IOException e) {
3792        LOG.warn("Unexpected exception while updating disk space.", e);
3793      }
3794    }
3795  }
3796
3797  private void finalizeINodeFileUnderConstruction(String src, 
3798      INodeFileUnderConstruction pendingFile, Snapshot latestSnapshot) 
3799      throws IOException, UnresolvedLinkException {
3800    assert hasWriteLock();
3801    leaseManager.removeLease(pendingFile.getClientName(), src);
3802    
3803    pendingFile = pendingFile.recordModification(latestSnapshot,
3804        dir.getINodeMap());
3805
3806    // The file is no longer pending.
3807    // Create permanent INode, update blocks
3808    final INodeFile newFile = pendingFile.toINodeFile(now());
3809    dir.replaceINodeFile(src, pendingFile, newFile);
3810
3811    // close file and persist block allocations for this file
3812    dir.closeFile(src, newFile);
3813
3814    blockManager.checkReplication(newFile);
3815  }
3816
3817  @VisibleForTesting
3818  BlockInfo getStoredBlock(Block block) {
3819    return blockManager.getStoredBlock(block);
3820  }
3821  
3822  @Override
3823  public boolean isInSnapshot(BlockInfoUnderConstruction blockUC) {
3824    assert hasReadLock();
3825    final BlockCollection bc = blockUC.getBlockCollection();
    if (!(bc instanceof INodeFileUnderConstruction)) { // instanceof is false for null
3827      return false;
3828    }
3829
3830    INodeFileUnderConstruction inodeUC = (INodeFileUnderConstruction) blockUC
3831        .getBlockCollection();
3832    String fullName = inodeUC.getName();
3833    try {
3834      if (fullName != null && fullName.startsWith(Path.SEPARATOR)
3835          && dir.getINode(fullName) == inodeUC) {
3836        // If file exists in normal path then no need to look in snapshot
3837        return false;
3838      }
3839    } catch (UnresolvedLinkException e) {
3840      LOG.error("Error while resolving the link : " + fullName, e);
3841      return false;
3842    }
3843    /*
3844     * 1. if bc is an instance of INodeFileUnderConstructionWithSnapshot, and
3845     * bc is not in the current fsdirectory tree, bc must represent a snapshot
3846     * file. 
3847     * 2. if fullName is not an absolute path, bc cannot be existent in the 
3848     * current fsdirectory tree. 
3849     * 3. if bc is not the current node associated with fullName, bc must be a
3850     * snapshot inode.
3851     */
3852    return true;
3853  }
3854
3855  void commitBlockSynchronization(ExtendedBlock lastblock,
3856      long newgenerationstamp, long newlength,
3857      boolean closeFile, boolean deleteblock, DatanodeID[] newtargets,
3858      String[] newtargetstorages)
3859      throws IOException, UnresolvedLinkException {
3860    LOG.info("commitBlockSynchronization(lastblock=" + lastblock
3861             + ", newgenerationstamp=" + newgenerationstamp
3862             + ", newlength=" + newlength
3863             + ", newtargets=" + Arrays.asList(newtargets)
3864             + ", closeFile=" + closeFile
3865             + ", deleteBlock=" + deleteblock
3866             + ")");
3867    checkOperation(OperationCategory.WRITE);
3868    String src = "";
3869    writeLock();
3870    try {
3871      checkOperation(OperationCategory.WRITE);
3872      // If a DN tries to commit to the standby, the recovery will
3873      // fail, and the next retry will succeed on the new NN.
3874  
3875      checkNameNodeSafeMode(
3876          "Cannot commitBlockSynchronization while in safe mode");
3877      final BlockInfo storedBlock = getStoredBlock(
3878          ExtendedBlock.getLocalBlock(lastblock));
3879      if (storedBlock == null) {
3880        if (deleteblock) {
3881          // This may be a retry attempt so ignore the failure
3882          // to locate the block.
3883          if (LOG.isDebugEnabled()) {
3884            LOG.debug("Block (=" + lastblock + ") not found");
3885          }
3886          return;
3887        } else {
3888          throw new IOException("Block (=" + lastblock + ") not found");
3889        }
3890      }
3891      INodeFile iFile = ((INode)storedBlock.getBlockCollection()).asFile();
3892      if (!iFile.isUnderConstruction() || storedBlock.isComplete()) {
3893        if (LOG.isDebugEnabled()) {
3894          LOG.debug("Unexpected block (=" + lastblock
3895                    + ") since the file (=" + iFile.getLocalName()
3896                    + ") is not under construction");
3897        }
3898        return;
3899      }
3900
3901      long recoveryId =
3902        ((BlockInfoUnderConstruction)storedBlock).getBlockRecoveryId();
3903      if(recoveryId != newgenerationstamp) {
3904        throw new IOException("The recovery id " + newgenerationstamp
3905                              + " does not match current recovery id "
3906                              + recoveryId + " for block " + lastblock); 
3907      }
3908
3909      INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction)iFile;
3910
3911      if (deleteblock) {
3912        Block blockToDel = ExtendedBlock.getLocalBlock(lastblock);
3913        boolean remove = pendingFile.removeLastBlock(blockToDel);
3914        if (remove) {
3915          blockManager.removeBlockFromMap(storedBlock);
3916        }
3917      }
3918      else {
3919        // update last block
3920        storedBlock.setGenerationStamp(newgenerationstamp);
3921        storedBlock.setNumBytes(newlength);
3922
3923        // find the DatanodeDescriptor objects
        // There should be no locations in the blockManager until now because
        // the file is under construction
3926        ArrayList<DatanodeDescriptor> trimmedTargets =
3927            new ArrayList<DatanodeDescriptor>(newtargets.length);
3928        ArrayList<String> trimmedStorages =
3929            new ArrayList<String>(newtargets.length);
3930        if (newtargets.length > 0) {
3931          for (int i = 0; i < newtargets.length; ++i) {
3932            // try to get targetNode
3933            DatanodeDescriptor targetNode =
3934                blockManager.getDatanodeManager().getDatanode(newtargets[i]);
3935            if (targetNode != null) {
3936              trimmedTargets.add(targetNode);
3937              trimmedStorages.add(newtargetstorages[i]);
3938            } else if (LOG.isDebugEnabled()) {
3939              LOG.debug("DatanodeDescriptor (=" + newtargets[i] + ") not found");
3940            }
3941          }
3942        }
        if (closeFile && !trimmedTargets.isEmpty()) {
3944          // the file is getting closed. Insert block locations into blockManager.
3945          // Otherwise fsck will report these blocks as MISSING, especially if the
3946          // blocksReceived from Datanodes take a long time to arrive.
3947          for (int i = 0; i < trimmedTargets.size(); i++) {
3948            trimmedTargets.get(i).addBlock(
3949              trimmedStorages.get(i), storedBlock);
3950          }
3951        }
3952
3953        // add pipeline locations into the INodeUnderConstruction
3954        DatanodeStorageInfo[] trimmedStorageInfos =
3955            blockManager.getDatanodeManager().getDatanodeStorageInfos(
3956                trimmedTargets.toArray(new DatanodeID[trimmedTargets.size()]),
3957                trimmedStorages.toArray(new String[trimmedStorages.size()]));
3958        pendingFile.setLastBlock(storedBlock, trimmedStorageInfos);
3959      }
3960
3961      if (closeFile) {
3962        src = closeFileCommitBlocks(pendingFile, storedBlock);
3963      } else {
3964        // If this commit does not want to close the file, persist blocks
3965        src = persistBlocks(pendingFile, false);
3966      }
3967    } finally {
3968      writeUnlock();
3969    }
3970    getEditLog().logSync();
3971    if (closeFile) {
3972      LOG.info("commitBlockSynchronization(newblock=" + lastblock
3973          + ", file=" + src
3974          + ", newgenerationstamp=" + newgenerationstamp
3975          + ", newlength=" + newlength
3976          + ", newtargets=" + Arrays.asList(newtargets) + ") successful");
3977    } else {
3978      LOG.info("commitBlockSynchronization(" + lastblock + ") successful");
3979    }
3980  }
3981
3982  /**
3983   *
3984   * @param pendingFile
3985   * @param storedBlock
3986   * @return Path of the file that was closed.
3987   * @throws IOException
3988   */
3989  @VisibleForTesting
3990  String closeFileCommitBlocks(INodeFileUnderConstruction pendingFile,
3991                                       BlockInfo storedBlock)
3992      throws IOException {
3993
3994    String src = leaseManager.findPath(pendingFile);
3995
3996    // commit the last block and complete it if it has minimum replicas
3997    commitOrCompleteLastBlock(pendingFile, storedBlock);
3998
3999    //remove lease, close file
4000    finalizeINodeFileUnderConstruction(src, pendingFile,
4001                                       Snapshot.findLatestSnapshot(pendingFile, null));
4002
4003    return src;
4004  }
4005
  /**
   * Persist the block list for the given file.
   *
   * @param pendingFile the file under construction whose blocks are persisted
   * @param logRetryCache whether to record this call in the retry cache
   * @return Path to the given file.
   * @throws IOException if the lease path cannot be found
   */
4013  @VisibleForTesting
4014  String persistBlocks(INodeFileUnderConstruction pendingFile,
4015      boolean logRetryCache) throws IOException {
4016    String src = leaseManager.findPath(pendingFile);
4017    dir.persistBlocks(src, pendingFile, logRetryCache);
4018    return src;
4019  }
4020
4021  /**
4022   * Renew the lease(s) held by the given client
4023   */
4024  void renewLease(String holder) throws IOException {
4025    checkOperation(OperationCategory.WRITE);
4026    readLock();
4027    try {
4028      checkOperation(OperationCategory.WRITE);
4029      checkNameNodeSafeMode("Cannot renew lease for " + holder);
4030      leaseManager.renewLease(holder);
4031    } finally {
4032      readUnlock();
4033    }
4034  }
4035
4036  /**
4037   * Get a partial listing of the indicated directory
4038   *
4039   * @param src the directory name
4040   * @param startAfter the name to start after
4041   * @param needLocation if blockLocations need to be returned
4042   * @return a partial listing starting after startAfter
4043   * 
4044   * @throws AccessControlException if access is denied
4045   * @throws UnresolvedLinkException if symbolic link is encountered
4046   * @throws IOException if other I/O error occurred
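   *
   * <p>Illustrative paging sketch (hypothetical caller code; {@code visit}
   * is an assumed callback): the last returned name is fed back in as
   * {@code startAfter} until no entries remain:
   * <pre>{@code
   * byte[] startAfter = HdfsFileStatus.EMPTY_NAME;
   * DirectoryListing page = getListing(src, startAfter, false);
   * while (page != null) {
   *   for (HdfsFileStatus st : page.getPartialListing()) {
   *     visit(st);
   *   }
   *   if (!page.hasMore()) {
   *     break;
   *   }
   *   startAfter = page.getLastName();
   *   page = getListing(src, startAfter, false);
   * }
   * }</pre>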
4047   */
4048  DirectoryListing getListing(String src, byte[] startAfter,
4049      boolean needLocation) 
4050      throws AccessControlException, UnresolvedLinkException, IOException {
4051    try {
4052      return getListingInt(src, startAfter, needLocation);
4053    } catch (AccessControlException e) {
4054      logAuditEvent(false, "listStatus", src);
4055      throw e;
4056    }
4057  }
4058
4059  private DirectoryListing getListingInt(String src, byte[] startAfter,
4060      boolean needLocation) 
4061    throws AccessControlException, UnresolvedLinkException, IOException {
4062    DirectoryListing dl;
4063    FSPermissionChecker pc = getPermissionChecker();
4064    checkOperation(OperationCategory.READ);
4065    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
4066    String startAfterString = new String(startAfter);
4067    readLock();
4068    try {
4069      checkOperation(OperationCategory.READ);
4070      src = FSDirectory.resolvePath(src, pathComponents, dir);
4071
4072      // Get file name when startAfter is an INodePath
4073      if (FSDirectory.isReservedName(startAfterString)) {
4074        byte[][] startAfterComponents = FSDirectory
4075            .getPathComponentsForReservedPath(startAfterString);
4076        try {
4077          String tmp = FSDirectory.resolvePath(src, startAfterComponents, dir);
4078          byte[][] regularPath = INode.getPathComponents(tmp);
4079          startAfter = regularPath[regularPath.length - 1];
4080        } catch (IOException e) {
4081          // Possibly the inode is deleted
4082          throw new DirectoryListingStartAfterNotFoundException(
4083              "Can't find startAfter " + startAfterString);
4084        }
4085      }
4086      
4087      if (isPermissionEnabled) {
4088        if (dir.isDir(src)) {
4089          checkPathAccess(pc, src, FsAction.READ_EXECUTE);
4090        } else {
4091          checkTraverse(pc, src);
4092        }
4093      }
4094      logAuditEvent(true, "listStatus", src);
4095      dl = dir.getListing(src, startAfter, needLocation);
4096    } finally {
4097      readUnlock();
4098    }
4099    return dl;
4100  }
4101
4102  /////////////////////////////////////////////////////////
4103  //
4104  // These methods are called by datanodes
4105  //
4106  /////////////////////////////////////////////////////////
4107  /**
4108   * Register Datanode.
4109   * <p>
   * The purpose of registration is to identify whether the new datanode
   * serves a new data storage, and will therefore report data block copies
   * the namenode was not yet aware of, or whether the datanode is a
   * replacement node for a data storage previously served by a different
   * datanode (or by the same one, in terms of host:port).
4115   * The data storages are distinguished by their storageIDs. When a new
4116   * data storage is reported the namenode issues a new unique storageID.
4117   * <p>
4118   * Finally, the namenode returns its namespaceID as the registrationID
4119   * for the datanodes. 
4120   * namespaceID is a persistent attribute of the name space.
4121   * The registrationID is checked every time the datanode is communicating
4122   * with the namenode. 
4123   * Datanodes with inappropriate registrationID are rejected.
   * If the namenode stops and then restarts, it can restore its
   * namespaceID and continue serving the datanodes that previously
   * registered with it, without restarting the whole cluster.
4127   * 
4128   * @see org.apache.hadoop.hdfs.server.datanode.DataNode
4129   */
4130  void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
4131    writeLock();
4132    try {
4133      getBlockManager().getDatanodeManager().registerDatanode(nodeReg);
4134      checkSafeMode();
4135    } finally {
4136      writeUnlock();
4137    }
4138  }
4139  
4140  /**
4141   * Get registrationID for datanodes based on the namespaceID.
4142   * 
4143   * @see #registerDatanode(DatanodeRegistration)
4144   * @return registration ID
4145   */
4146  String getRegistrationID() {
4147    return Storage.getRegistrationID(dir.fsImage.getStorage());
4148  }
4149
4150  /**
4151   * The given node has reported in.  This method should:
4152   * 1) Record the heartbeat, so the datanode isn't timed out
4153   * 2) Adjust usage stats for future block allocation
4154   * 
   * If a substantial amount of time has passed since the last datanode
   * heartbeat, then request an immediate block report.
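   *
   * <p>Illustrative arithmetic (config key name assumed): with
   * {@code dfs.namenode.replication.max-streams} = 2 and
   * {@code xmitsInProgress} = 1, at most {@code 2 - 1 = 1} new replication
   * transfer is handed to this datanode via the returned commands.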
4157   * 
4158   * @return an array of datanode commands 
4159   * @throws IOException
4160   */
4161  HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
4162      StorageReport[] reports, long cacheCapacity, long cacheUsed,
4163      int xceiverCount, int xmitsInProgress, int failedVolumes)
4164        throws IOException {
4165    readLock();
4166    try {
4167      final int maxTransfer = blockManager.getMaxReplicationStreams()
4168          - xmitsInProgress;
4169      DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
4170          nodeReg, reports, blockPoolId, cacheCapacity, cacheUsed,
4171          xceiverCount, maxTransfer, failedVolumes);
4172      return new HeartbeatResponse(cmds, createHaStatusHeartbeat());
4173    } finally {
4174      readUnlock();
4175    }
4176  }
4177
4178  private NNHAStatusHeartbeat createHaStatusHeartbeat() {
4179    HAState state = haContext.getState();
4180    return new NNHAStatusHeartbeat(state.getServiceState(),
4181        getFSImage().getLastAppliedOrWrittenTxId());
4182  }
4183
4184  /**
4185   * Returns whether or not there were available resources at the last check of
4186   * resources.
4187   *
4188   * @return true if there were sufficient resources available, false otherwise.
4189   */
4190  boolean nameNodeHasResourcesAvailable() {
4191    return hasResourcesAvailable;
4192  }
4193
4194  /**
4195   * Perform resource checks and cache the results.
4197   */
4198  void checkAvailableResources() {
4199    Preconditions.checkState(nnResourceChecker != null,
4200        "nnResourceChecker not initialized");
4201    hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
4202  }
4203
4204  /**
   * Periodically calls {@link NameNodeResourceChecker#hasAvailableDiskSpace()}
   * via {@link #checkAvailableResources()}, and if resources are found to be
   * insufficient, causes the NN to enter safe mode. Low-resource safe mode
   * must be turned off manually by an administrator once resources are
   * restored (see {@link SafeModeInfo#getTurnOffTip()}).
4209   */
4210  class NameNodeResourceMonitor implements Runnable  {
4211    boolean shouldNNRmRun = true;
4212    @Override
    public void run() {
4214      try {
4215        while (fsRunning && shouldNNRmRun) {
4216          checkAvailableResources();
4217          if(!nameNodeHasResourcesAvailable()) {
4218            String lowResourcesMsg = "NameNode low on available disk space. ";
4219            if (!isInSafeMode()) {
4220              FSNamesystem.LOG.warn(lowResourcesMsg + "Entering safe mode.");
4221            } else {
4222              FSNamesystem.LOG.warn(lowResourcesMsg + "Already in safe mode.");
4223            }
4224            enterSafeMode(true);
4225          }
4226          try {
4227            Thread.sleep(resourceRecheckInterval);
4228          } catch (InterruptedException ie) {
4229            // Deliberately ignore
4230          }
4231        }
4232      } catch (Exception e) {
4233        FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
4234      }
4235    }
4236
4237    public void stopMonitor() {
4238      shouldNNRmRun = false;
4239    }
  }
4241
4242  class NameNodeEditLogRoller implements Runnable {
4243
4244    private boolean shouldRun = true;
4245    private final long rollThreshold;
4246    private final long sleepIntervalMs;
4247
    public NameNodeEditLogRoller(long rollThreshold, int sleepIntervalMs) {
      this.rollThreshold = rollThreshold;
      this.sleepIntervalMs = sleepIntervalMs;
    }
4252
4253    @Override
4254    public void run() {
4255      while (fsRunning && shouldRun) {
4256        try {
4257          FSEditLog editLog = getFSImage().getEditLog();
4258          long numEdits =
4259              editLog.getLastWrittenTxId() - editLog.getCurSegmentTxId();
4260          if (numEdits > rollThreshold) {
4261            FSNamesystem.LOG.info("NameNode rolling its own edit log because"
4262                + " number of edits in open segment exceeds threshold of "
4263                + rollThreshold);
4264            rollEditLog();
4265          }
4266          Thread.sleep(sleepIntervalMs);
4267        } catch (InterruptedException e) {
4268          FSNamesystem.LOG.info(NameNodeEditLogRoller.class.getSimpleName()
4269              + " was interrupted, exiting");
4270          break;
4271        } catch (Exception e) {
4272          FSNamesystem.LOG.error("Swallowing exception in "
4273              + NameNodeEditLogRoller.class.getSimpleName() + ":", e);
4274        }
4275      }
4276    }
4277
4278    public void stop() {
4279      shouldRun = false;
4280    }
4281  }
4282
4283  public FSImage getFSImage() {
4284    return dir.fsImage;
4285  }
4286
4287  public FSEditLog getEditLog() {
4288    return getFSImage().getEditLog();
4289  }    
4290
4291  private void checkBlock(ExtendedBlock block) throws IOException {
4292    if (block != null && !this.blockPoolId.equals(block.getBlockPoolId())) {
4293      throw new IOException("Unexpected BlockPoolId " + block.getBlockPoolId()
4294          + " - expected " + blockPoolId);
4295    }
4296  }
4297
4298  @Metric({"MissingBlocks", "Number of missing blocks"})
4299  public long getMissingBlocksCount() {
4300    // not locking
4301    return blockManager.getMissingBlocksCount();
4302  }
4303  
4304  @Metric({"ExpiredHeartbeats", "Number of expired heartbeats"})
4305  public int getExpiredHeartbeats() {
4306    return datanodeStatistics.getExpiredHeartbeats();
4307  }
4308  
4309  @Metric({"TransactionsSinceLastCheckpoint",
4310      "Number of transactions since last checkpoint"})
4311  public long getTransactionsSinceLastCheckpoint() {
4312    return getEditLog().getLastWrittenTxId() -
4313        getFSImage().getStorage().getMostRecentCheckpointTxId();
4314  }
4315  
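  // Worked example (illustrative): if the open segment began at txid 100 and
  // the last written txid is 149, the metric below reports 149 - 100 + 1 = 50.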
4316  @Metric({"TransactionsSinceLastLogRoll",
4317      "Number of transactions since last edit log roll"})
4318  public long getTransactionsSinceLastLogRoll() {
4319    if (isInStandbyState() || !getEditLog().isSegmentOpen()) {
4320      return 0;
4321    } else {
4322      return getEditLog().getLastWrittenTxId() -
4323        getEditLog().getCurSegmentTxId() + 1;
4324    }
4325  }
4326  
4327  @Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"})
4328  public long getLastWrittenTransactionId() {
4329    return getEditLog().getLastWrittenTxId();
4330  }
4331  
4332  @Metric({"LastCheckpointTime",
4333      "Time in milliseconds since the epoch of the last checkpoint"})
4334  public long getLastCheckpointTime() {
4335    return getFSImage().getStorage().getMostRecentCheckpointTime();
4336  }
4337
4338  /** @see ClientProtocol#getStats() */
4339  long[] getStats() {
4340    final long[] stats = datanodeStatistics.getStats();
4341    stats[ClientProtocol.GET_STATS_UNDER_REPLICATED_IDX] = getUnderReplicatedBlocks();
4342    stats[ClientProtocol.GET_STATS_CORRUPT_BLOCKS_IDX] = getCorruptReplicaBlocks();
4343    stats[ClientProtocol.GET_STATS_MISSING_BLOCKS_IDX] = getMissingBlocksCount();
4344    return stats;
4345  }
4346
4347  @Override // FSNamesystemMBean
4348  @Metric({"CapacityTotal",
4349      "Total raw capacity of data nodes in bytes"})
4350  public long getCapacityTotal() {
4351    return datanodeStatistics.getCapacityTotal();
4352  }
4353
4354  @Metric({"CapacityTotalGB",
4355      "Total raw capacity of data nodes in GB"})
4356  public float getCapacityTotalGB() {
4357    return DFSUtil.roundBytesToGB(getCapacityTotal());
4358  }
4359
4360  @Override // FSNamesystemMBean
4361  @Metric({"CapacityUsed",
4362      "Total used capacity across all data nodes in bytes"})
4363  public long getCapacityUsed() {
4364    return datanodeStatistics.getCapacityUsed();
4365  }
4366
4367  @Metric({"CapacityUsedGB",
4368      "Total used capacity across all data nodes in GB"})
4369  public float getCapacityUsedGB() {
4370    return DFSUtil.roundBytesToGB(getCapacityUsed());
4371  }
4372
4373  @Override // FSNamesystemMBean
4374  @Metric({"CapacityRemaining", "Remaining capacity in bytes"})
4375  public long getCapacityRemaining() {
4376    return datanodeStatistics.getCapacityRemaining();
4377  }
4378
4379  @Metric({"CapacityRemainingGB", "Remaining capacity in GB"})
4380  public float getCapacityRemainingGB() {
4381    return DFSUtil.roundBytesToGB(getCapacityRemaining());
4382  }
4383
4384  @Metric({"CapacityUsedNonDFS",
4385      "Total space used by data nodes for non DFS purposes in bytes"})
4386  public long getCapacityUsedNonDFS() {
4387    return datanodeStatistics.getCapacityUsedNonDFS();
4388  }
4389
4390  /**
4391   * Total number of connections.
4392   */
4393  @Override // FSNamesystemMBean
4394  @Metric
4395  public int getTotalLoad() {
4396    return datanodeStatistics.getXceiverCount();
4397  }
4398  
4399  @Metric({ "SnapshottableDirectories", "Number of snapshottable directories" })
4400  public int getNumSnapshottableDirs() {
4401    return this.snapshotManager.getNumSnapshottableDirs();
4402  }
4403
4404  @Metric({ "Snapshots", "The number of snapshots" })
4405  public int getNumSnapshots() {
4406    return this.snapshotManager.getNumSnapshots();
4407  }
4408
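  // The value below is a JSON string, e.g. (illustrative numbers):
  //   {"SnapshottableDirectories":2,"Snapshots":5}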
4409  @Override
4410  public String getSnapshotStats() {
4411    Map<String, Object> info = new HashMap<String, Object>();
4412    info.put("SnapshottableDirectories", this.getNumSnapshottableDirs());
4413    info.put("Snapshots", this.getNumSnapshots());
4414    return JSON.toString(info);
4415  }
4416
4417  int getNumberOfDatanodes(DatanodeReportType type) {
4418    readLock();
4419    try {
4420      return getBlockManager().getDatanodeManager().getDatanodeListForReport(
4421          type).size(); 
4422    } finally {
4423      readUnlock();
4424    }
4425  }
4426
4427  DatanodeInfo[] datanodeReport(final DatanodeReportType type
4428      ) throws AccessControlException, StandbyException {
4429    checkSuperuserPrivilege();
4430    checkOperation(OperationCategory.UNCHECKED);
4431    readLock();
4432    try {
4433      checkOperation(OperationCategory.UNCHECKED);
4434      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
4435      final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type);
4436
4437      DatanodeInfo[] arr = new DatanodeInfo[results.size()];
4438      for (int i=0; i<arr.length; i++) {
4439        arr[i] = new DatanodeInfo(results.get(i));
4440      }
4441      return arr;
4442    } finally {
4443      readUnlock();
4444    }
4445  }
4446
4447  /**
4448   * Save namespace image.
   * This will save the current namespace into the fsimage file and an
   * empty edits file.
4450   * Requires superuser privilege and safe mode.
4451   * 
4452   * @throws AccessControlException if superuser privilege is violated.
   * @throws IOException if the NN is not in safe mode, or if saving the
   *                     namespace image fails.
4454   */
4455  void saveNamespace() throws AccessControlException, IOException {
4456    checkOperation(OperationCategory.UNCHECKED);
4457    checkSuperuserPrivilege();
4458    
4459    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
4460    if (cacheEntry != null && cacheEntry.isSuccess()) {
4461      return; // Return previous response
4462    }
4463    boolean success = false;
4464    readLock();
4465    try {
4466      checkOperation(OperationCategory.UNCHECKED);
4467      if (!isInSafeMode()) {
4468        throw new IOException("Safe mode should be turned ON "
4469            + "in order to create namespace image.");
4470      }
4471      getFSImage().saveNamespace(this);
4472      success = true;
4473    } finally {
4474      readUnlock();
4475      RetryCache.setState(cacheEntry, success);
4476    }
4477    LOG.info("New namespace image has been created");
4478  }
4479  
4480  /**
4481   * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
4482   * Requires superuser privilege.
4483   * 
4484   * @throws AccessControlException if superuser privilege is violated.
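   *
   * <p>Semantics of {@code arg} (as implemented below): {@code "check"}
   * returns the current setting without changing it; {@code "true"} enables
   * restore and returns true; any other value (e.g. {@code "false"})
   * disables it and returns false.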
4485   */
4486  boolean restoreFailedStorage(String arg) throws AccessControlException,
4487      StandbyException {
4488    checkSuperuserPrivilege();
4489    checkOperation(OperationCategory.UNCHECKED);
4490    writeLock();
4491    try {
4492      checkOperation(OperationCategory.UNCHECKED);
4493      
      // "check" returns the current setting; "true"/"false" enables/disables it.
4495      if(arg.equals("check"))
4496        return getFSImage().getStorage().getRestoreFailedStorage();
4497      
4498      boolean val = arg.equals("true");  // false if not
4499      getFSImage().getStorage().setRestoreFailedStorage(val);
4500      
4501      return val;
4502    } finally {
4503      writeUnlock();
4504    }
4505  }
4506
4507  Date getStartTime() {
4508    return new Date(startTime); 
4509  }
4510    
4511  void finalizeUpgrade() throws IOException {
4512    checkSuperuserPrivilege();
4513    checkOperation(OperationCategory.WRITE);
4514    writeLock();
4515    try {
4516      checkOperation(OperationCategory.WRITE);
4517      getFSImage().finalizeUpgrade();
4518    } finally {
4519      writeUnlock();
4520    }
4521  }
4522
4523  void refreshNodes() throws IOException {
4524    checkOperation(OperationCategory.UNCHECKED);
4525    checkSuperuserPrivilege();
4526    getBlockManager().getDatanodeManager().refreshNodes(new HdfsConfiguration());
4527  }
4528
4529  void setBalancerBandwidth(long bandwidth) throws IOException {
4530    checkOperation(OperationCategory.UNCHECKED);
4531    checkSuperuserPrivilege();
4532    getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth);
4533  }
4534
4535  /**
4536   * SafeModeInfo contains information related to the safe mode.
4537   * <p>
4538   * An instance of {@link SafeModeInfo} is created when the name node
4539   * enters safe mode.
4540   * <p>
4541   * During name node startup {@link SafeModeInfo} counts the number of
4542   * <em>safe blocks</em>, those that have at least the minimal number of
4543   * replicas, and calculates the ratio of safe blocks to the total number
   * of blocks in the system, which is the number of blocks tracked by
   * {@link FSNamesystem#blockManager}. When the ratio reaches the
   * {@link #threshold} it starts the SafeModeMonitor daemon in order
   * to monitor whether the safe mode {@link #extension} has passed.
   * Then it leaves safe mode and destroys itself.
4549   * <p>
4550   * If safe mode is turned on manually then the number of safe blocks is
4551   * not tracked because the name node is not intended to leave safe mode
   * automatically in that case.
4553   *
4554   * @see ClientProtocol#setSafeMode(HdfsConstants.SafeModeAction, boolean)
4555   */
4556  public class SafeModeInfo {
4557    // configuration fields
4558    /** Safe mode threshold condition %.*/
4559    private double threshold;
4560    /** Safe mode minimum number of datanodes alive */
4561    private int datanodeThreshold;
4562    /** Safe mode extension after the threshold. */
4563    private int extension;
4564    /** Min replication required by safe mode. */
4565    private int safeReplication;
4566    /** threshold for populating needed replication queues */
4567    private double replQueueThreshold;
4568      
4569    // internal fields
4570    /** Time when threshold was reached.
4571     * <br> -1 safe mode is off
4572     * <br> 0 safe mode is on, and threshold is not reached yet
4573     * <br> >0 safe mode is on, but we are in extension period 
4574     */
4575    private long reached = -1;  
4576    /** Total number of blocks. */
4577    int blockTotal; 
4578    /** Number of safe blocks. */
4579    int blockSafe;
4580    /** Number of blocks needed to satisfy safe mode threshold condition */
4581    private int blockThreshold;
4582    /** Number of blocks needed before populating replication queues */
4583    private int blockReplQueueThreshold;
4584    /** time of the last status printout */
4585    private long lastStatusReport = 0;
4586    /** flag indicating whether replication queues have been initialized */
4587    boolean initializedReplQueues = false;
4588    /** Was safemode entered automatically because available resources were low. */
4589    private boolean resourcesLow = false;
4590    /** Should safemode adjust its block totals as blocks come in */
4591    private boolean shouldIncrementallyTrackBlocks = false;
4592    /** counter for tracking startup progress of reported blocks */
4593    private Counter awaitingReportedBlocksCounter;
4594    
4595    /**
4596     * Creates SafeModeInfo when the name node enters
4597     * automatic safe mode at startup.
4598     *  
4599     * @param conf configuration
4600     */
4601    private SafeModeInfo(Configuration conf) {
4602      this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY,
4603          DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT);
4604      if(threshold > 1.0) {
        LOG.warn("The threshold value shouldn't be greater than 1, threshold: " + threshold);
4606      }
4607      this.datanodeThreshold = conf.getInt(
4608        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY,
4609        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT);
4610      this.extension = conf.getInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
4611      this.safeReplication = conf.getInt(DFS_NAMENODE_REPLICATION_MIN_KEY, 
4612                                         DFS_NAMENODE_REPLICATION_MIN_DEFAULT);
4613      
4614      LOG.info(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY + " = " + threshold);
4615      LOG.info(DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY + " = " + datanodeThreshold);
4616      LOG.info(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + "     = " + extension);
4617
4618      // default to safe mode threshold (i.e., don't populate queues before leaving safe mode)
4619      this.replQueueThreshold = 
4620        conf.getFloat(DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY,
4621                      (float) threshold);
4622      this.blockTotal = 0; 
4623      this.blockSafe = 0;
4624    }
4625
4626    /**
4627     * In the HA case, the StandbyNode can be in safemode while the namespace
4628     * is modified by the edit log tailer. In this case, the number of total
     * blocks changes as edits are processed (e.g. blocks are added and deleted).
4630     * However, we don't want to do the incremental tracking during the
4631     * startup-time loading process -- only once the initial total has been
4632     * set after the image has been loaded.
4633     */
4634    private boolean shouldIncrementallyTrackBlocks() {
4635      return shouldIncrementallyTrackBlocks;
4636    }
4637
4638    /**
4639     * Creates SafeModeInfo when safe mode is entered manually, or because
4640     * available resources are low.
4641     *
4642     * The {@link #threshold} is set to 1.5 so that it could never be reached.
4643     * {@link #blockTotal} is set to -1 to indicate that safe mode is manual.
4644     * 
4645     * @see SafeModeInfo
4646     */
4647    private SafeModeInfo(boolean resourcesLow, boolean isReplQueuesInited) {
4648      this.threshold = 1.5f;  // this threshold can never be reached
4649      this.datanodeThreshold = Integer.MAX_VALUE;
4650      this.extension = Integer.MAX_VALUE;
4651      this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
4652      this.replQueueThreshold = 1.5f; // can never be reached
4653      this.blockTotal = -1;
4654      this.blockSafe = -1;
4655      this.resourcesLow = resourcesLow;
4656      this.initializedReplQueues = isReplQueuesInited;
4657      enter();
4658      reportStatus("STATE* Safe mode is ON.", true);
4659    }
4660      
4661    /**
4662     * Check if safe mode is on.
4663     * @return true if in safe mode
4664     */
4665    private synchronized boolean isOn() {
4666      doConsistencyCheck();
4667      return this.reached >= 0;
4668    }
4669      
4670    /**
4671     * Check if we are populating replication queues.
4672     */
4673    private synchronized boolean isPopulatingReplQueues() {
4674      return initializedReplQueues;
4675    }
4676
4677    /**
4678     * Enter safe mode.
4679     */
4680    private void enter() {
4681      this.reached = 0;
4682    }
4683      
4684    /**
4685     * Leave safe mode.
4686     * <p>
     * Check for invalid, under- & over-replicated blocks at the end of startup.
4688     */
4689    private synchronized void leave() {
4690      // if not done yet, initialize replication queues.
4691      // In the standby, do not populate repl queues
4692      if (!isPopulatingReplQueues() && shouldPopulateReplQueues()) {
4693        initializeReplQueues();
4694      }
4695      long timeInSafemode = now() - startTime;
4696      NameNode.stateChangeLog.info("STATE* Leaving safe mode after " 
4697                                    + timeInSafemode/1000 + " secs");
4698      NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode);
4699
4700      //Log the following only once (when transitioning from ON -> OFF)
4701      if (reached >= 0) {
4702        NameNode.stateChangeLog.info("STATE* Safe mode is OFF"); 
4703      }
4704      reached = -1;
4705      safeMode = null;
4706      final NetworkTopology nt = blockManager.getDatanodeManager().getNetworkTopology();
4707      NameNode.stateChangeLog.info("STATE* Network topology has "
4708          + nt.getNumOfRacks() + " racks and "
4709          + nt.getNumOfLeaves() + " datanodes");
4710      NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
4711          + blockManager.numOfUnderReplicatedBlocks() + " blocks");
4712
4713      startSecretManagerIfNecessary();
4714
4715      // If startup has not yet completed, end safemode phase.
4716      StartupProgress prog = NameNode.getStartupProgress();
4717      if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
4718        prog.endStep(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS);
4719        prog.endPhase(Phase.SAFEMODE);
4720      }
4721    }
4722
4723    /**
4724     * Initialize replication queues.
4725     */
4726    private synchronized void initializeReplQueues() {
4727      LOG.info("initializing replication queues");
4728      assert !isPopulatingReplQueues() : "Already initialized repl queues";
4729      long startTimeMisReplicatedScan = now();
4730      blockManager.processMisReplicatedBlocks();
4731      initializedReplQueues = true;
4732      NameNode.stateChangeLog.info("STATE* Replication Queue initialization "
4733          + "scan for invalid, over- and under-replicated blocks "
4734          + "completed in " + (now() - startTimeMisReplicatedScan)
4735          + " msec");
4736    }
4737
4738    /**
4739     * Check whether we have reached the threshold for 
4740     * initializing replication queues.
4741     */
4742    private synchronized boolean canInitializeReplQueues() {
4743      return shouldPopulateReplQueues()
4744          && blockSafe >= blockReplQueueThreshold;
4745    }
4746      
4747    /** 
     * Safe mode can be turned off iff
     * the threshold is reached and
     * the extension time has passed.
     * @return true if it is safe to leave, false otherwise.
4752     */
4753    private synchronized boolean canLeave() {
4754      if (reached == 0)
4755        return false;
4756      if (now() - reached < extension) {
4757        reportStatus("STATE* Safe mode ON.", false);
4758        return false;
4759      }
4760      return !needEnter();
4761    }
4762      
4763    /** 
4764     * There is no need to enter safe mode 
4765     * if DFS is empty or {@link #threshold} == 0
4766     */
4767    private boolean needEnter() {
4768      return (threshold != 0 && blockSafe < blockThreshold) ||
4769        (datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold) ||
4770        (!nameNodeHasResourcesAvailable());
4771    }
4772      
4773    /**
4774     * Check and trigger safe mode if needed. 
4775     */
4776    private void checkMode() {
4777      // Have to have write-lock since leaving safemode initializes
4778      // repl queues, which requires write lock
4779      assert hasWriteLock();
4780      // if smmthread is already running, the block threshold must have been 
4781      // reached before, there is no need to enter the safe mode again
4782      if (smmthread == null && needEnter()) {
4783        enter();
4784        // check if we are ready to initialize replication queues
4785        if (canInitializeReplQueues() && !isPopulatingReplQueues()) {
4786          initializeReplQueues();
4787        }
4788        reportStatus("STATE* Safe mode ON.", false);
4789        return;
4790      }
4791      // the threshold is reached or was reached before
4792      if (!isOn() ||                           // safe mode is off
4793          extension <= 0 || threshold <= 0) {  // don't need to wait
4794        this.leave(); // leave safe mode
4795        return;
4796      }
4797      if (reached > 0) {  // threshold has already been reached before
4798        reportStatus("STATE* Safe mode ON.", false);
4799        return;
4800      }
4801      // start monitor
4802      reached = now();
4803      if (smmthread == null) {
4804        smmthread = new Daemon(new SafeModeMonitor());
4805        smmthread.start();
4806        reportStatus("STATE* Safe mode extension entered.", true);
4807      }
4808
4809      // check if we are ready to initialize replication queues
4810      if (canInitializeReplQueues() && !isPopulatingReplQueues()) {
4811        initializeReplQueues();
4812      }
4813    }
4814      
4815    /**
4816     * Set total number of blocks.
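     * <p>Worked example (illustrative): with {@code total} = 1000 and
     * {@code threshold} = 0.999, {@code blockThreshold} becomes
     * {@code (int) (1000 * 0.999)} = 999, so 999 complete blocks must be
     * reported safe before the threshold counts as reached.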
4817     */
4818    private synchronized void setBlockTotal(int total) {
4819      this.blockTotal = total;
4820      this.blockThreshold = (int) (blockTotal * threshold);
4821      this.blockReplQueueThreshold = 
4822        (int) (blockTotal * replQueueThreshold);
4823      if (haEnabled) {
4824        // After we initialize the block count, any further namespace
4825        // modifications done while in safe mode need to keep track
4826        // of the number of total blocks in the system.
4827        this.shouldIncrementallyTrackBlocks = true;
4828      }
4829      if(blockSafe < 0)
4830        this.blockSafe = 0;
4831      checkMode();
4832    }
4833      
4834    /**
4835     * Increment number of safe blocks if current block has 
4836     * reached minimal replication.
4837     * @param replication current replication 
4838     */
4839    private synchronized void incrementSafeBlockCount(short replication) {
4840      if (replication == safeReplication) {
4841        this.blockSafe++;
4842
4843        // Report startup progress only if we haven't completed startup yet.
4844        StartupProgress prog = NameNode.getStartupProgress();
4845        if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
4846          if (this.awaitingReportedBlocksCounter == null) {
4847            this.awaitingReportedBlocksCounter = prog.getCounter(Phase.SAFEMODE,
4848              STEP_AWAITING_REPORTED_BLOCKS);
4849          }
4850          this.awaitingReportedBlocksCounter.increment();
4851        }
4852
4853        checkMode();
4854      }
4855    }
4856      
4857    /**
4858     * Decrement number of safe blocks if current block has 
4859     * fallen below minimal replication.
4860     * @param replication current replication 
4861     */
4862    private synchronized void decrementSafeBlockCount(short replication) {
4863      if (replication == safeReplication-1) {
4864        this.blockSafe--;
4865        //blockSafe is set to -1 in manual / low resources safemode
4866        assert blockSafe >= 0 || isManual() || areResourcesLow();
4867        checkMode();
4868      }
4869    }
4870
4871    /**
4872     * Check if safe mode was entered manually
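     * (marked by the sentinel {@code extension == Integer.MAX_VALUE}, set by
     * {@link #setManual()} and by the manual-mode constructor).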
4873     */
4874    private boolean isManual() {
4875      return extension == Integer.MAX_VALUE;
4876    }
4877
4878    /**
4879     * Set manual safe mode.
4880     */
4881    private synchronized void setManual() {
4882      extension = Integer.MAX_VALUE;
4883    }
4884
4885    /**
4886     * Check if safe mode was entered due to resources being low.
4887     */
4888    private boolean areResourcesLow() {
4889      return resourcesLow;
4890    }
4891
4892    /**
4893     * Set that resources are low for this instance of safe mode.
4894     */
4895    private void setResourcesLow() {
4896      resourcesLow = true;
4897    }
4898
4899    /**
4900     * A tip on how safe mode is to be turned off: manually or automatically.
4901     */
4902    String getTurnOffTip() {
4903      if(!isOn())
4904        return "Safe mode is OFF.";
4905
4906      //Manual OR low-resource safemode. (Admin intervention required)
4907      String leaveMsg = "It was turned on manually. ";
4908      if (areResourcesLow()) {
4909        leaveMsg = "Resources are low on NN. Please add or free up more "
4910          + "resources then turn off safe mode manually. NOTE:  If you turn off"
4911          + " safe mode before adding resources, "
4912          + "the NN will immediately return to safe mode. ";
4913      }
4914      if (isManual() || areResourcesLow()) {
4915        return leaveMsg
4916          + "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off.";
4917      }
4918
4919      //Automatic safemode. System will come out of safemode automatically.
4920      leaveMsg = "Safe mode will be turned off automatically";
4921      int numLive = getNumLiveDataNodes();
4922      String msg = "";
4923      if (reached == 0) {
4924        if (blockSafe < blockThreshold) {
          msg += String.format(
            "The reported blocks %d needs an additional %d"
            + " blocks to reach the threshold %.4f of total blocks %d.\n",
            blockSafe, blockThreshold - blockSafe, threshold, blockTotal);
4929        }
4930        if (numLive < datanodeThreshold) {
4931          msg += String.format(
4932            "The number of live datanodes %d needs an additional %d live "
4933            + "datanodes to reach the minimum number %d.\n",
4934            numLive, (datanodeThreshold - numLive), datanodeThreshold);
4935        }
4936      } else {
4937        msg = String.format("The reported blocks %d has reached the threshold"
4938            + " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal);
4939
4940        msg += String.format("The number of live datanodes %d has reached "
4941                               + "the minimum number %d. ",
4942                               numLive, datanodeThreshold);
4943      }
4944      msg += leaveMsg;
4945      // threshold is not reached or manual or resources low
4946      if(reached == 0 || (isManual() && !areResourcesLow())) {
4947        return msg;
4948      }
4949      // extension period is in progress
4950      return msg + (reached + extension - now() > 0 ?
4951        " in " + (reached + extension - now()) / 1000 + " seconds."
4952        : " soon.");
4953    }
4954
4955    /**
4956     * Print status every 20 seconds.
4957     */
4958    private void reportStatus(String msg, boolean rightNow) {
4959      long curTime = now();
4960      if(!rightNow && (curTime - lastStatusReport < 20 * 1000))
4961        return;
4962      NameNode.stateChangeLog.info(msg + " \n" + getTurnOffTip());
4963      lastStatusReport = curTime;
4964    }
4965
4966    @Override
4967    public String toString() {
4968      String resText = "Current safe blocks = " 
4969        + blockSafe 
4970        + ". Target blocks = " + blockThreshold + " for threshold = %" + threshold
4971        + ". Minimal replication = " + safeReplication + ".";
4972      if (reached > 0) 
4973        resText += " Threshold was reached " + new Date(reached) + ".";
4974      return resText;
4975    }
4976      
4977    /**
4978     * Checks consistency of the class state.
4979     * This is costly so only runs if asserts are enabled.
4980     */
4981    private void doConsistencyCheck() {
4982      boolean assertsOn = false;
      assert assertsOn = true; // intentional side effect; only executes when asserts are enabled
4984      if (!assertsOn) return;
4985      
4986      if (blockTotal == -1 && blockSafe == -1) {
4987        return; // manual safe mode
4988      }
4989      int activeBlocks = blockManager.getActiveBlockCount();
4990      if ((blockTotal != activeBlocks) &&
4991          !(blockSafe >= 0 && blockSafe <= blockTotal)) {
4992        throw new AssertionError(
4993            " SafeMode: Inconsistent filesystem state: "
4994        + "SafeMode data: blockTotal=" + blockTotal
4995        + " blockSafe=" + blockSafe + "; "
4996        + "BlockManager data: active="  + activeBlocks);
4997      }
4998    }
4999
5000    private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) {
5001      if (!shouldIncrementallyTrackBlocks) {
5002        return;
5003      }
5004      assert haEnabled;
5005      
5006      if (LOG.isDebugEnabled()) {
5007        LOG.debug("Adjusting block totals from " +
5008            blockSafe + "/" + blockTotal + " to " +
5009            (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal));
5010      }
5011      assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " +
5012        blockSafe + " by " + deltaSafe + ": would be negative";
5013      assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " +
5014        blockTotal + " by " + deltaTotal + ": would be negative";
5015      
5016      blockSafe += deltaSafe;
5017      setBlockTotal(blockTotal + deltaTotal);
5018    }
5019  }
5020    
5021  /**
5022   * Periodically check whether it is time to leave safe mode.
5023   * This thread starts when the threshold level is reached.
5024   *
5025   */
5026  class SafeModeMonitor implements Runnable {
5027    /** interval in msec for checking safe mode: {@value} */
5028    private static final long recheckInterval = 1000;
5029      
5032    @Override
5033    public void run() {
5034      while (fsRunning) {
5035        writeLock();
5036        try {
5037          if (safeMode == null) { // Not in safe mode.
5038            break;
5039          }
5040          if (safeMode.canLeave()) {
5041            // Leave safe mode.
5042            safeMode.leave();
5043            smmthread = null;
5044            break;
5045          }
5046        } finally {
5047          writeUnlock();
5048        }
5049
5050        try {
5051          Thread.sleep(recheckInterval);
5052        } catch (InterruptedException ie) {
5053          // Ignored
5054        }
5055      }
5056      if (!fsRunning) {
5057        LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
5058      }
5059    }
5060  }
5061    
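  // Backs "hdfs dfsadmin -safemode <enter|leave|get>" (CLI mapping assumed):
  // ENTER and LEAVE mutate state; GET only reads it.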
5062  boolean setSafeMode(SafeModeAction action) throws IOException {
5063    if (action != SafeModeAction.SAFEMODE_GET) {
5064      checkSuperuserPrivilege();
5065      switch(action) {
5066      case SAFEMODE_LEAVE: // leave safe mode
5067        leaveSafeMode();
5068        break;
5069      case SAFEMODE_ENTER: // enter safe mode
5070        enterSafeMode(false);
5071        break;
5072      default:
5073        LOG.error("Unexpected safe mode action");
5074      }
5075    }
5076    return isInSafeMode();
5077  }
5078
5079  @Override
5080  public void checkSafeMode() {
5081    // safeMode is volatile, and may be set to null at any time
5082    SafeModeInfo safeMode = this.safeMode;
5083    if (safeMode != null) {
5084      safeMode.checkMode();
5085    }
5086  }
5087
5088  @Override
5089  public boolean isInSafeMode() {
5090    // safeMode is volatile, and may be set to null at any time
5091    SafeModeInfo safeMode = this.safeMode;
5092    if (safeMode == null)
5093      return false;
5094    return safeMode.isOn();
5095  }
5096
5097  @Override
5098  public boolean isInStartupSafeMode() {
5099    // safeMode is volatile, and may be set to null at any time
5100    SafeModeInfo safeMode = this.safeMode;
5101    if (safeMode == null)
5102      return false;
5103    // If the NN is in safemode, and not due to manual / low resources, we
5104    // assume it must be because of startup. If the NN had low resources during
5105    // startup, we assume it came out of startup safemode and it is now in low
5106    // resources safemode
5107    return !safeMode.isManual() && !safeMode.areResourcesLow()
5108      && safeMode.isOn();
5109  }
5110
5111  /**
5112   * Check if replication queues are to be populated
5113   * @return true when node is HAState.Active and not in the very first safemode
5114   */
5115  @Override
5116  public boolean isPopulatingReplQueues() {
5117    if (!shouldPopulateReplQueues()) {
5118      return false;
5119    }
5120    // safeMode is volatile, and may be set to null at any time
5121    SafeModeInfo safeMode = this.safeMode;
5122    if (safeMode == null)
5123      return true;
5124    return safeMode.isPopulatingReplQueues();
5125  }
5126
5127  private boolean shouldPopulateReplQueues() {
5128    if(haContext == null || haContext.getState() == null)
5129      return false;
5130    return haContext.getState().shouldPopulateReplQueues();
5131  }
5132
5133  @Override
5134  public void incrementSafeBlockCount(int replication) {
5135    // safeMode is volatile, and may be set to null at any time
5136    SafeModeInfo safeMode = this.safeMode;
5137    if (safeMode == null)
5138      return;
5139    safeMode.incrementSafeBlockCount((short)replication);
5140  }
5141
5142  @Override
5143  public void decrementSafeBlockCount(Block b) {
5144    // safeMode is volatile, and may be set to null at any time
5145    SafeModeInfo safeMode = this.safeMode;
5146    if (safeMode == null) // mostly true
5147      return;
5148    BlockInfo storedBlock = getStoredBlock(b);
5149    if (storedBlock.isComplete()) {
5150      safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas());
5151    }
5152  }
5153  
5154  /**
5155   * Adjust the total number of blocks safe and expected during safe mode.
5156   * If safe mode is not currently on, this is a no-op.
5157   * @param deltaSafe the change in number of safe blocks
   * @param deltaTotal the change in number of total blocks expected
5159   */
5160  @Override
5161  public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) {
5162    // safeMode is volatile, and may be set to null at any time
5163    SafeModeInfo safeMode = this.safeMode;
5164    if (safeMode == null)
5165      return;
5166    safeMode.adjustBlockTotals(deltaSafe, deltaTotal);
5167  }
5168
5169  /**
5170   * Set the total number of blocks in the system. 
5171   */
5172  public void setBlockTotal() {
5173    // safeMode is volatile, and may be set to null at any time
5174    SafeModeInfo safeMode = this.safeMode;
5175    if (safeMode == null)
5176      return;
5177    safeMode.setBlockTotal((int)getCompleteBlocksTotal());
5178  }
5179
5180  /**
5181   * Get the total number of blocks in the system. 
5182   */
5183  @Override // FSNamesystemMBean
5184  @Metric
5185  public long getBlocksTotal() {
5186    return blockManager.getTotalBlocks();
5187  }
5188
5189  /**
5190   * Get the total number of COMPLETE blocks in the system.
5191   * For safe mode only complete blocks are counted.
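   * <p>Illustrative: with 1000 blocks total, of which 7 belong to files
   * under construction and are not yet complete, this returns 1000 - 7 = 993.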
5192   */
5193  private long getCompleteBlocksTotal() {
5194    // Calculate number of blocks under construction
5195    long numUCBlocks = 0;
5196    readLock();
5197    try {
5198      for (Lease lease : leaseManager.getSortedLeases()) {
5199        for (String path : lease.getPaths()) {
5200          final INodeFileUnderConstruction cons;
5201          try {
5202            cons = INodeFileUnderConstruction.valueOf(dir.getINode(path), path);
5203          } catch (UnresolvedLinkException e) {
5204            throw new AssertionError("Lease files should reside on this FS");
5205          } catch (IOException e) {
5206            throw new RuntimeException(e);
5207          }
5208          BlockInfo[] blocks = cons.getBlocks();
5209          if(blocks == null)
5210            continue;
5211          for(BlockInfo b : blocks) {
5212            if(!b.isComplete())
5213              numUCBlocks++;
5214          }
5215        }
5216      }
5217      LOG.info("Number of blocks under construction: " + numUCBlocks);
5218      return getBlocksTotal() - numUCBlocks;
5219    } finally {
5220      readUnlock();
5221    }
5222  }
5223
5224  /**
5225   * Enter safe mode. If resourcesLow is false, then we assume it is manual
5226   * @throws IOException
5227   */
5228  void enterSafeMode(boolean resourcesLow) throws IOException {
5229    writeLock();
5230    try {
5231      // Stop the secret manager, since rolling the master key would
5232      // try to write to the edit log
5233      stopSecretManager();
5234
5235      // Ensure that any concurrent operations have been fully synced
5236      // before entering safe mode. This ensures that the FSImage
5237      // is entirely stable on disk as soon as we're in safe mode.
5238      boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite();
      // Before the edit log is open for write, editLogStream will be null,
      // so logSyncAll may only be called while the edit log is open for write.
5241      if (isEditlogOpenForWrite) {
5242        getEditLog().logSyncAll();
5243      }
5244      if (!isInSafeMode()) {
5245        safeMode = new SafeModeInfo(resourcesLow, isPopulatingReplQueues());
5246        return;
5247      }
5248      if (resourcesLow) {
5249        safeMode.setResourcesLow();
5250      } else {
5251        safeMode.setManual();
5252      }
5253      if (isEditlogOpenForWrite) {
5254        getEditLog().logSyncAll();
5255      }
5256      NameNode.stateChangeLog.info("STATE* Safe mode is ON"
5257          + safeMode.getTurnOffTip());
5258    } finally {
5259      writeUnlock();
5260    }
5261  }
5262
5263  /**
5264   * Leave safe mode.
5265   * @throws IOException
5266   */
5267  void leaveSafeMode() {
5268    writeLock();
5269    try {
5270      if (!isInSafeMode()) {
5271        NameNode.stateChangeLog.info("STATE* Safe mode is already OFF"); 
5272        return;
5273      }
5274      safeMode.leave();
5275    } finally {
5276      writeUnlock();
5277    }
5278  }
5279    
5280  String getSafeModeTip() {
5281    readLock();
5282    try {
5283      if (!isInSafeMode()) {
5284        return "";
5285      }
5286      return safeMode.getTurnOffTip();
5287    } finally {
5288      readUnlock();
5289    }
5290  }
5291
5292  CheckpointSignature rollEditLog() throws IOException {
5293    checkSuperuserPrivilege();
5294    checkOperation(OperationCategory.JOURNAL);
5295    writeLock();
5296    try {
5297      checkOperation(OperationCategory.JOURNAL);
5298      checkNameNodeSafeMode("Log not rolled");
5299      if (Server.isRpcInvocation()) {
5300        LOG.info("Roll Edit Log from " + Server.getRemoteAddress());
5301      }
5302      return getFSImage().rollEditLog();
5303    } finally {
5304      writeUnlock();
5305    }
5306  }
5307
5308  NamenodeCommand startCheckpoint(NamenodeRegistration backupNode,
5309      NamenodeRegistration activeNamenode) throws IOException {
5310    checkOperation(OperationCategory.CHECKPOINT);
5311    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
5312        null);
5313    if (cacheEntry != null && cacheEntry.isSuccess()) {
5314      return (NamenodeCommand) cacheEntry.getPayload();
5315    }
5316    writeLock();
5317    NamenodeCommand cmd = null;
5318    try {
5319      checkOperation(OperationCategory.CHECKPOINT);
5320
5321      checkNameNodeSafeMode("Checkpoint not started");
5322      LOG.info("Start checkpoint for " + backupNode.getAddress());
5323      cmd = getFSImage().startCheckpoint(backupNode, activeNamenode);
5324      getEditLog().logSync();
5325      return cmd;
5326    } finally {
5327      writeUnlock();
5328      RetryCache.setState(cacheEntry, cmd != null, cmd);
5329    }
5330  }
5331
5332  public void processIncrementalBlockReport(final DatanodeID nodeID,
5333      final String poolId, final StorageReceivedDeletedBlocks srdb)
5334      throws IOException {
5335    writeLock();
5336    try {
5337      blockManager.processIncrementalBlockReport(nodeID, poolId, srdb);
5338    } finally {
5339      writeUnlock();
5340    }
5341  }
5342  
5343  void endCheckpoint(NamenodeRegistration registration,
5344                            CheckpointSignature sig) throws IOException {
5345    checkOperation(OperationCategory.CHECKPOINT);
5346    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
5347    if (cacheEntry != null && cacheEntry.isSuccess()) {
5348      return; // Return previous response
5349    }
5350    boolean success = false;
5351    readLock();
5352    try {
5353      checkOperation(OperationCategory.CHECKPOINT);
5354
5355      checkNameNodeSafeMode("Checkpoint not ended");
5356      LOG.info("End checkpoint for " + registration.getAddress());
5357      getFSImage().endCheckpoint(sig);
5358      success = true;
5359    } finally {
5360      readUnlock();
5361      RetryCache.setState(cacheEntry, success);
5362    }
5363  }
5364
5365  PermissionStatus createFsOwnerPermissions(FsPermission permission) {
5366    return new PermissionStatus(fsOwner.getShortUserName(), supergroup, permission);
5367  }
5368
5369  private void checkOwner(FSPermissionChecker pc, String path)
5370      throws AccessControlException, UnresolvedLinkException {
5371    checkPermission(pc, path, true, null, null, null, null);
5372  }
5373
5374  private void checkPathAccess(FSPermissionChecker pc,
5375      String path, FsAction access) throws AccessControlException,
5376      UnresolvedLinkException {
5377    checkPermission(pc, path, false, null, null, access, null);
5378  }
5379
5380  private void checkParentAccess(FSPermissionChecker pc,
5381      String path, FsAction access) throws AccessControlException,
5382      UnresolvedLinkException {
5383    checkPermission(pc, path, false, null, access, null, null);
5384  }
5385
5386  private void checkAncestorAccess(FSPermissionChecker pc,
5387      String path, FsAction access) throws AccessControlException,
5388      UnresolvedLinkException {
5389    checkPermission(pc, path, false, access, null, null, null);
5390  }
5391
5392  private void checkTraverse(FSPermissionChecker pc, String path)
5393      throws AccessControlException, UnresolvedLinkException {
5394    checkPermission(pc, path, false, null, null, null, null);
5395  }
5396
5397  @Override
5398  public void checkSuperuserPrivilege()
5399      throws AccessControlException {
5400    if (isPermissionEnabled) {
5401      FSPermissionChecker pc = getPermissionChecker();
5402      pc.checkSuperuserPrivilege();
5403    }
5404  }
5405
5406  /**
   * Check whether the current user has permission to access the path. For more
5408   * details of the parameters, see
5409   * {@link FSPermissionChecker#checkPermission()}.
5410   */
5411  private void checkPermission(FSPermissionChecker pc,
5412      String path, boolean doCheckOwner, FsAction ancestorAccess,
5413      FsAction parentAccess, FsAction access, FsAction subAccess)
5414      throws AccessControlException, UnresolvedLinkException {
5415        checkPermission(pc, path, doCheckOwner, ancestorAccess,
5416            parentAccess, access, subAccess, true);
5417  }
5418
5419  /**
   * Check whether the current user has permission to access the path. For more
5421   * details of the parameters, see
5422   * {@link FSPermissionChecker#checkPermission()}.
5423   */
5424  private void checkPermission(FSPermissionChecker pc,
5425      String path, boolean doCheckOwner, FsAction ancestorAccess,
5426      FsAction parentAccess, FsAction access, FsAction subAccess,
5427      boolean resolveLink)
5428      throws AccessControlException, UnresolvedLinkException {
5429    if (!pc.isSuperUser()) {
5430      dir.waitForReady();
5431      readLock();
5432      try {
5433        pc.checkPermission(path, dir.rootDir, doCheckOwner, ancestorAccess,
5434            parentAccess, access, subAccess, resolveLink);
5435      } finally {
5436        readUnlock();
5437      }
5438    }
5439  }
5440  
5441  /**
   * Check to see if we have exceeded the limit on the number
   * of filesystem objects (inodes and blocks).
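   *
   * <p>Worked example (illustrative values): with {@code maxFsObjects} = 100,
   * 60 inodes and 41 blocks, 100 &lt;= 60 + 41, so the check fails and an
   * IOException is thrown.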
5444   */
5445  void checkFsObjectLimit() throws IOException {
5446    if (maxFsObjects != 0 &&
5447        maxFsObjects <= dir.totalInodes() + getBlocksTotal()) {
5448      throw new IOException("Exceeded the configured number of objects " +
5449                             maxFsObjects + " in the filesystem.");
5450    }
5451  }
5452
5453  /**
   * Get the maximum number of objects allowed in the system.
5455   */
5456  @Override // FSNamesystemMBean
5457  public long getMaxObjects() {
5458    return maxFsObjects;
5459  }
5460
5461  @Override // FSNamesystemMBean
5462  @Metric
5463  public long getFilesTotal() {
5464    readLock();
5465    try {
5466      return this.dir.totalInodes();
5467    } finally {
5468      readUnlock();
5469    }
5470  }
5471
5472  @Override // FSNamesystemMBean
5473  @Metric
5474  public long getPendingReplicationBlocks() {
5475    return blockManager.getPendingReplicationBlocksCount();
5476  }
5477
5478  @Override // FSNamesystemMBean
5479  @Metric
5480  public long getUnderReplicatedBlocks() {
5481    return blockManager.getUnderReplicatedBlocksCount();
5482  }
5483
5484  /** Returns number of blocks with corrupt replicas */
5485  @Metric({"CorruptBlocks", "Number of blocks with corrupt replicas"})
5486  public long getCorruptReplicaBlocks() {
5487    return blockManager.getCorruptReplicaBlocksCount();
5488  }
5489
5490  @Override // FSNamesystemMBean
5491  @Metric
5492  public long getScheduledReplicationBlocks() {
5493    return blockManager.getScheduledReplicationBlocksCount();
5494  }
5495
5496  @Metric
5497  public long getPendingDeletionBlocks() {
5498    return blockManager.getPendingDeletionBlocksCount();
5499  }
5500
5501  @Metric
5502  public long getExcessBlocks() {
5503    return blockManager.getExcessBlocksCount();
5504  }
5505  
5506  // HA-only metric
5507  @Metric
5508  public long getPostponedMisreplicatedBlocks() {
5509    return blockManager.getPostponedMisreplicatedBlocksCount();
5510  }
5511
5512  // HA-only metric
5513  @Metric
5514  public int getPendingDataNodeMessageCount() {
5515    return blockManager.getPendingDataNodeMessageCount();
5516  }
5517  
5518  // HA-only metric
5519  @Metric
5520  public String getHAState() {
5521    return haContext.getState().toString();
5522  }
5523
5524  // HA-only metric
5525  @Metric
5526  public long getMillisSinceLastLoadedEdits() {
5527    if (isInStandbyState() && editLogTailer != null) {
5528      return now() - editLogTailer.getLastLoadTimestamp();
5529    } else {
5530      return 0;
5531    }
5532  }
5533  
5534  @Metric
5535  public int getBlockCapacity() {
5536    return blockManager.getCapacity();
5537  }
5538
5539  @Override // FSNamesystemMBean
5540  public String getFSState() {
5541    return isInSafeMode() ? "safeMode" : "Operational";
5542  }
5543  
5544  private ObjectName mbeanName;
5545  private ObjectName mxbeanName;
5546
5547  /**
5548   * Register the FSNamesystem MBean using the name
5549   *        "hadoop:service=NameNode,name=FSNamesystemState"
5550   */
5551  private void registerMBean() {
5552    // We can only implement one MXBean interface, so we keep the old one.
5553    try {
5554      StandardMBean bean = new StandardMBean(this, FSNamesystemMBean.class);
5555      mbeanName = MBeans.register("NameNode", "FSNamesystemState", bean);
5556    } catch (NotCompliantMBeanException e) {
5557      throw new RuntimeException("Bad MBean setup", e);
5558    }
5559
5560    LOG.info("Registered FSNamesystemState MBean");
5561  }
5562
5563  /**
   * Shut down the FSNamesystem.
5565   */
5566  void shutdown() {
5567    if (mbeanName != null) {
5568      MBeans.unregister(mbeanName);
5569      mbeanName = null;
5570    }
5571    if (mxbeanName != null) {
5572      MBeans.unregister(mxbeanName);
5573      mxbeanName = null;
5574    }
5575    if (dir != null) {
5576      dir.shutdown();
5577    }
5578    if (blockManager != null) {
5579      blockManager.shutdown();
5580    }
5581  }
5582  
5583
5584  @Override // FSNamesystemMBean
5585  public int getNumLiveDataNodes() {
5586    return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
5587  }
5588
5589  @Override // FSNamesystemMBean
5590  public int getNumDeadDataNodes() {
5591    return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
5592  }
5593  
5594  @Override // FSNamesystemMBean
5595  public int getNumDecomLiveDataNodes() {
5596    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
5597    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
5598    int liveDecommissioned = 0;
5599    for (DatanodeDescriptor node : live) {
5600      liveDecommissioned += node.isDecommissioned() ? 1 : 0;
5601    }
5602    return liveDecommissioned;
5603  }
5604
5605  @Override // FSNamesystemMBean
5606  public int getNumDecomDeadDataNodes() {
5607    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
5608    getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, true);
5609    int deadDecommissioned = 0;
5610    for (DatanodeDescriptor node : dead) {
5611      deadDecommissioned += node.isDecommissioned() ? 1 : 0;
5612    }
5613    return deadDecommissioned;
5614  }
5615
5616  @Override // FSNamesystemMBean
5617  public int getNumDecommissioningDataNodes() {
5618    return getBlockManager().getDatanodeManager().getDecommissioningNodes()
5619        .size();
5620  }
5621
5622  @Override // FSNamesystemMBean
5623  @Metric({"StaleDataNodes", 
5624    "Number of datanodes marked stale due to delayed heartbeat"})
5625  public int getNumStaleDataNodes() {
5626    return getBlockManager().getDatanodeManager().getNumStaleNodes();
5627  }
5628
5629  /**
5630   * Sets the current generation stamp for legacy blocks
5631   */
5632  void setGenerationStampV1(long stamp) {
5633    generationStampV1.setCurrentValue(stamp);
5634  }
5635
5636  /**
5637   * Gets the current generation stamp for legacy blocks
5638   */
5639  long getGenerationStampV1() {
5640    return generationStampV1.getCurrentValue();
5641  }
5642
5643  /**
   * Sets the current generation stamp for this filesystem
5645   */
5646  void setGenerationStampV2(long stamp) {
5647    generationStampV2.setCurrentValue(stamp);
5648  }
5649
5650  /**
5651   * Gets the current generation stamp for this filesystem
5652   */
5653  long getGenerationStampV2() {
5654    return generationStampV2.getCurrentValue();
5655  }
5656
5657  /**
5658   * Upgrades the generation stamp for the filesystem
5659   * by reserving a sufficient range for all existing blocks.
5660   * Should be invoked only during the first upgrade to
5661   * sequential block IDs.
5662   */
5663  long upgradeGenerationStampToV2() {
5664    Preconditions.checkState(generationStampV2.getCurrentValue() ==
5665        GenerationStamp.LAST_RESERVED_STAMP);
5666
5667    generationStampV2.skipTo(
5668        generationStampV1.getCurrentValue() +
5669        HdfsConstants.RESERVED_GENERATION_STAMPS_V1);
5670
5671    generationStampV1Limit = generationStampV2.getCurrentValue();
5672    return generationStampV2.getCurrentValue();
5673  }
5674
5675  /**
5676   * Sets the generation stamp that delineates random and sequentially
5677   * allocated block IDs.
   * @param stamp the generation stamp in effect at the time of the switch
   *          to sequential block IDs
5679   */
5680  void setGenerationStampV1Limit(long stamp) {
5681    Preconditions.checkState(generationStampV1Limit ==
5682                             GenerationStamp.GRANDFATHER_GENERATION_STAMP);
5683    generationStampV1Limit = stamp;
5684  }
5685
5686  /**
5687   * Gets the value of the generation stamp that delineates sequential
5688   * and random block IDs.
5689   */
5690  long getGenerationStampAtblockIdSwitch() {
5691    return generationStampV1Limit;
5692  }
5693
5694  @VisibleForTesting
5695  SequentialBlockIdGenerator getBlockIdGenerator() {
5696    return blockIdGenerator;
5697  }
5698
5699  /**
5700   * Sets the maximum allocated block ID for this filesystem. This is
5701   * the basis for allocating new block IDs.
5702   */
5703  void setLastAllocatedBlockId(long blockId) {
5704    blockIdGenerator.skipTo(blockId);
5705  }
5706
5707  /**
5708   * Gets the maximum sequentially allocated block ID for this filesystem
5709   */
5710  long getLastAllocatedBlockId() {
5711    return blockIdGenerator.getCurrentValue();
5712  }
5713
5714  /**
5715   * Increments, logs and then returns the stamp
5716   */
5717  long nextGenerationStamp(boolean legacyBlock)
5718      throws IOException, SafeModeException {
5719    assert hasWriteLock();
5720    checkNameNodeSafeMode("Cannot get next generation stamp");
5721
5722    long gs;
5723    if (legacyBlock) {
5724      gs = getNextGenerationStampV1();
5725      getEditLog().logGenerationStampV1(gs);
5726    } else {
5727      gs = getNextGenerationStampV2();
5728      getEditLog().logGenerationStampV2(gs);
5729    }
5730
5731    // NB: callers sync the log
5732    return gs;
5733  }
5734
5735  @VisibleForTesting
5736  long getNextGenerationStampV1() throws IOException {
5737    long genStampV1 = generationStampV1.nextValue();
5738
5739    if (genStampV1 >= generationStampV1Limit) {
      // We ran out of generation stamps for legacy blocks. In practice, this
      // is extremely unlikely because we reserved 1T v1 generation stamps. The
5742      // result is that we can no longer append to the legacy blocks that
5743      // were created before the upgrade to sequential block IDs.
5744      throw new OutOfV1GenerationStampsException();
5745    }
5746
5747    return genStampV1;
5748  }
5749
5750  @VisibleForTesting
5751  long getNextGenerationStampV2() {
5752    return generationStampV2.nextValue();
5753  }
5754
5755  long getGenerationStampV1Limit() {
5756    return generationStampV1Limit;
5757  }
5758
5759  /**
5760   * Determine whether the block ID was randomly generated (legacy) or
5761   * sequentially generated. The generation stamp value is used to
5762   * make the distinction.
   * @param block the block to check
5764   * @return true if the block ID was randomly generated, false otherwise.
5765   */
5766  boolean isLegacyBlock(Block block) {
5767    return block.getGenerationStamp() < getGenerationStampV1Limit();
5768  }
5769
5770  /**
5771   * Increments, logs and then returns the block ID
5772   */
5773  private long nextBlockId() throws IOException {
5774    assert hasWriteLock();
5775    checkNameNodeSafeMode("Cannot get next block ID");
5776    final long blockId = blockIdGenerator.nextValue();
5777    getEditLog().logAllocateBlockId(blockId);
5778    // NB: callers sync the log
5779    return blockId;
5780  }
5781
5782  private INodeFileUnderConstruction checkUCBlock(ExtendedBlock block,
5783      String clientName) throws IOException {
5784    assert hasWriteLock();
5785    checkNameNodeSafeMode("Cannot get a new generation stamp and an "
5786        + "access token for block " + block);
5787    
5788    // check stored block state
5789    BlockInfo storedBlock = getStoredBlock(ExtendedBlock.getLocalBlock(block));
5790    if (storedBlock == null || 
5791        storedBlock.getBlockUCState() != BlockUCState.UNDER_CONSTRUCTION) {
        throw new IOException(block +
            " does not exist or is not under construction: " + storedBlock);
5794    }
5795    
5796    // check file inode
5797    final INodeFile file = ((INode)storedBlock.getBlockCollection()).asFile();
    if (file == null || !file.isUnderConstruction()) {
      throw new IOException("The file that " + storedBlock +
          " belongs to does not exist or is not under construction.");
5801    }
5802    
5803    // check lease
5804    INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction)file;
5805    if (clientName == null || !clientName.equals(pendingFile.getClientName())) {
      throw new LeaseExpiredException("Lease mismatch: " + block +
          " is accessed by a non-lease holder " + clientName);
5808    }
5809
5810    return pendingFile;
5811  }
5812  
5813  /**
5814   * Client is reporting some bad block locations.
5815   */
5816  void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
5817    checkOperation(OperationCategory.WRITE);
5818    NameNode.stateChangeLog.info("*DIR* reportBadBlocks");
5819    writeLock();
5820    try {
5821      checkOperation(OperationCategory.WRITE);
5822      for (int i = 0; i < blocks.length; i++) {
5823        ExtendedBlock blk = blocks[i].getBlock();
5824        DatanodeInfo[] nodes = blocks[i].getLocations();
5825        String[] storageIDs = blocks[i].getStorageIDs();
5826        for (int j = 0; j < nodes.length; j++) {
5827          blockManager.findAndMarkBlockAsCorrupt(blk, nodes[j],
5828              storageIDs == null ? null: storageIDs[j], 
5829              "client machine reported it");
5830        }
5831      }
5832    } finally {
5833      writeUnlock();
5834    }
5835  }
5836
5837  /**
5838   * Get a new generation stamp together with an access token for 
5839   * a block under construction
5840   * 
5841   * This method is called for recovering a failed pipeline or setting up
5842   * a pipeline to append to a block.
5843   * 
5844   * @param block a block
5845   * @param clientName the name of a client
5846   * @return a located block with a new generation stamp and an access token
5847   * @throws IOException if any error occurs
5848   */
5849  LocatedBlock updateBlockForPipeline(ExtendedBlock block, 
5850      String clientName) throws IOException {
5851    LocatedBlock locatedBlock;
5852    checkOperation(OperationCategory.WRITE);
5853    writeLock();
5854    try {
5855      checkOperation(OperationCategory.WRITE);
5856
      // check validity of the parameters
5858      checkUCBlock(block, clientName);
5859  
5860      // get a new generation stamp and an access token
5861      block.setGenerationStamp(
5862          nextGenerationStamp(isLegacyBlock(block.getLocalBlock())));
5863      locatedBlock = new LocatedBlock(block, new DatanodeInfo[0]);
5864      blockManager.setBlockToken(locatedBlock, AccessMode.WRITE);
5865    } finally {
5866      writeUnlock();
5867    }
5868    // Ensure we record the new generation stamp
5869    getEditLog().logSync();
5870    return locatedBlock;
5871  }
5872  
5873  /**
5874   * Update a pipeline for a block under construction
5875   * 
5876   * @param clientName the name of the client
   * @param oldBlock the old block
5878   * @param newBlock a new block with a new generation stamp and length
5879   * @param newNodes datanodes in the pipeline
5880   * @throws IOException if any error occurs
5881   */
5882  void updatePipeline(String clientName, ExtendedBlock oldBlock, 
5883      ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs)
5884      throws IOException {
5885    checkOperation(OperationCategory.WRITE);
5886    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
5887    if (cacheEntry != null && cacheEntry.isSuccess()) {
5888      return; // Return previous response
5889    }
5890    LOG.info("updatePipeline(block=" + oldBlock
5891             + ", newGenerationStamp=" + newBlock.getGenerationStamp()
5892             + ", newLength=" + newBlock.getNumBytes()
5893             + ", newNodes=" + Arrays.asList(newNodes)
5894             + ", clientName=" + clientName
5895             + ")");
5896    writeLock();
5897    boolean success = false;
5898    try {
5899      checkOperation(OperationCategory.WRITE);
5900      checkNameNodeSafeMode("Pipeline not updated");
      assert newBlock.getBlockId()==oldBlock.getBlockId() : newBlock + " and "
        + oldBlock + " have different block identifiers";
5903      updatePipelineInternal(clientName, oldBlock, newBlock, newNodes,
5904          newStorageIDs, cacheEntry != null);
5905      success = true;
5906    } finally {
5907      writeUnlock();
5908      RetryCache.setState(cacheEntry, success);
5909    }
5910    getEditLog().logSync();
5911    LOG.info("updatePipeline(" + oldBlock + ") successfully to " + newBlock);
5912  }
5913
  /** @see #updatePipeline(String, ExtendedBlock, ExtendedBlock, DatanodeID[], String[]) */
5915  private void updatePipelineInternal(String clientName, ExtendedBlock oldBlock, 
5916      ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs,
5917      boolean logRetryCache)
5918      throws IOException {
5919    assert hasWriteLock();
    // check the validity of the block and lease holder name
5921    final INodeFileUnderConstruction pendingFile
5922        = checkUCBlock(oldBlock, clientName);
5923    final BlockInfoUnderConstruction blockinfo
5924        = (BlockInfoUnderConstruction)pendingFile.getLastBlock();
5925
    // check new GS & length: the new stamp must be newer and the length
    // must not shrink; anything else is unexpected
5927    if (newBlock.getGenerationStamp() <= blockinfo.getGenerationStamp() ||
5928        newBlock.getNumBytes() < blockinfo.getNumBytes()) {
5929      String msg = "Update " + oldBlock + " (len = " + 
5930        blockinfo.getNumBytes() + ") to an older state: " + newBlock + 
5931        " (len = " + newBlock.getNumBytes() +")";
5932      LOG.warn(msg);
5933      throw new IOException(msg);
5934    }
5935
5936    // Update old block with the new generation stamp and new length
5937    blockinfo.setNumBytes(newBlock.getNumBytes());
5938    blockinfo.setGenerationStampAndVerifyReplicas(newBlock.getGenerationStamp());
5939
5940    // find the DatanodeDescriptor objects
5941    final DatanodeStorageInfo[] storages = blockManager.getDatanodeManager()
5942        .getDatanodeStorageInfos(newNodes, newStorageIDs);
5943    blockinfo.setExpectedLocations(storages);
5944
5945    String src = leaseManager.findPath(pendingFile);
5946    dir.persistBlocks(src, pendingFile, logRetryCache);
5947  }
5948
5949  // rename was successful. If any part of the renamed subtree had
5950  // files that were being written to, update with new filename.
5951  void unprotectedChangeLease(String src, String dst) {
5952    assert hasWriteLock();
5953    leaseManager.changeLease(src, dst);
5954  }
5955
5956  /**
5957   * Serializes leases. 
5958   */
5959  void saveFilesUnderConstruction(DataOutputStream out,
5960      Map<Long, INodeFileUnderConstruction> snapshotUCMap) throws IOException {
5961    // This is run by an inferior thread of saveNamespace, which holds a read
5962    // lock on our behalf. If we took the read lock here, we could block
5963    // for fairness if a writer is waiting on the lock.
5964    synchronized (leaseManager) {
5965      Map<String, INodeFileUnderConstruction> nodes =
5966          leaseManager.getINodesUnderConstruction();
5967      for (Map.Entry<String, INodeFileUnderConstruction> entry
5968          : nodes.entrySet()) {
5969        // TODO: for HDFS-5428, because of rename operations, some
5970        // under-construction files that are
5971        // in the current fs directory can also be captured in the
5972        // snapshotUCMap. We should remove them from the snapshotUCMap.
5973        snapshotUCMap.remove(entry.getValue().getId());
5974      }
5975      
5976      out.writeInt(nodes.size() + snapshotUCMap.size()); // write the size    
5977      for (Map.Entry<String, INodeFileUnderConstruction> entry
5978           : nodes.entrySet()) {
5979        FSImageSerialization.writeINodeUnderConstruction(
5980            out, entry.getValue(), entry.getKey());
5981      }
5982      for (Map.Entry<Long, INodeFileUnderConstruction> entry
5983          : snapshotUCMap.entrySet()) {
5984        // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
5985        // as their paths
5986        StringBuilder b = new StringBuilder();
5987        b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
5988            .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
5989            .append(Path.SEPARATOR).append(entry.getValue().getId());
5990        FSImageSerialization.writeINodeUnderConstruction(
5991            out, entry.getValue(), b.toString());
5992      }
5993    }
5994  }
5995
5996  /**
5997   * Register a Backup name-node, verifying that it belongs
5998   * to the correct namespace, and adding it to the set of
5999   * active journals if necessary.
6000   * 
6001   * @param bnReg registration of the new BackupNode
6002   * @param nnReg registration of this NameNode
6003   * @throws IOException if the namespace IDs do not match
6004   */
6005  void registerBackupNode(NamenodeRegistration bnReg,
6006      NamenodeRegistration nnReg) throws IOException {
6007    writeLock();
6008    try {
6009      if(getFSImage().getStorage().getNamespaceID() 
6010         != bnReg.getNamespaceID())
6011        throw new IOException("Incompatible namespaceIDs: "
6012            + " Namenode namespaceID = "
6013            + getFSImage().getStorage().getNamespaceID() + "; "
6014            + bnReg.getRole() +
6015            " node namespaceID = " + bnReg.getNamespaceID());
6016      if (bnReg.getRole() == NamenodeRole.BACKUP) {
6017        getFSImage().getEditLog().registerBackupNode(
6018            bnReg, nnReg);
6019      }
6020    } finally {
6021      writeUnlock();
6022    }
6023  }
6024
6025  /**
6026   * Release (unregister) backup node.
6027   * <p>
6028   * Find and remove the backup stream corresponding to the node.
   * @param registration the registration of the backup node to release
6030   * @throws IOException
6031   */
6032  void releaseBackupNode(NamenodeRegistration registration)
6033    throws IOException {
6034    checkOperation(OperationCategory.WRITE);
6035    writeLock();
6036    try {
6037      checkOperation(OperationCategory.WRITE);
6038      if(getFSImage().getStorage().getNamespaceID()
6039         != registration.getNamespaceID())
6040        throw new IOException("Incompatible namespaceIDs: "
6041            + " Namenode namespaceID = "
6042            + getFSImage().getStorage().getNamespaceID() + "; "
6043            + registration.getRole() +
6044            " node namespaceID = " + registration.getNamespaceID());
6045      getEditLog().releaseBackupStream(registration);
6046    } finally {
6047      writeUnlock();
6048    }
6049  }
6050
6051  static class CorruptFileBlockInfo {
6052    String path;
6053    Block block;
6054    
6055    public CorruptFileBlockInfo(String p, Block b) {
6056      path = p;
6057      block = b;
6058    }
6059    
6060    @Override
6061    public String toString() {
6062      return block.getBlockName() + "\t" + path;
6063    }
6064  }
6065  /**
   * @param path Restrict corrupt files to this portion of the namespace.
   * @param cookieTab Support for continuation; the set of files we return
   *  is ordered by block id; the cookie tells where to resume from
6069   * @return a list in which each entry describes a corrupt file/block
6070   * @throws AccessControlException
6071   * @throws IOException
6072   */
6073  Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path,
6074  String[] cookieTab) throws IOException {
6075    checkSuperuserPrivilege();
6076    checkOperation(OperationCategory.READ);
6077    readLock();
6078    try {
6079      checkOperation(OperationCategory.READ);
6080      if (!isPopulatingReplQueues()) {
6081        throw new IOException("Cannot run listCorruptFileBlocks because " +
6082                              "replication queues have not been initialized.");
6083      }
      // return a limited number of corrupt files per call
6085      int count = 0;
6086      ArrayList<CorruptFileBlockInfo> corruptFiles = new ArrayList<CorruptFileBlockInfo>();
6087
6088      final Iterator<Block> blkIterator = blockManager.getCorruptReplicaBlockIterator();
6089
6090      if (cookieTab == null) {
6091        cookieTab = new String[] { null };
6092      }
6093      int skip = getIntCookie(cookieTab[0]);
6094      for (int i = 0; i < skip && blkIterator.hasNext(); i++) {
6095        blkIterator.next();
6096      }
6097
6098      while (blkIterator.hasNext()) {
6099        Block blk = blkIterator.next();
6100        final INode inode = (INode)blockManager.getBlockCollection(blk);
6101        skip++;
6102        if (inode != null && blockManager.countNodes(blk).liveReplicas() == 0) {
6103          String src = FSDirectory.getFullPathName(inode);
6104          if (src.startsWith(path)){
6105            corruptFiles.add(new CorruptFileBlockInfo(src, blk));
6106            count++;
6107            if (count >= DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED)
6108              break;
6109          }
6110        }
6111      }
6112      cookieTab[0] = String.valueOf(skip);
6113      LOG.info("list corrupt file blocks returned: " + count);
6114      return corruptFiles;
6115    } finally {
6116      readUnlock();
6117    }
6118  }
6119
6120  /**
   * Convert a string cookie to an integer; null or malformed cookies map to 0.
6122   */
6123  private static int getIntCookie(String cookie){
6124    int c;
6125    if(cookie == null){
6126      c = 0;
6127    } else {
6128      try{
6129        c = Integer.parseInt(cookie);
6130      }catch (NumberFormatException e) {
6131        c = 0;
6132      }
6133    }
6134    c = Math.max(0, c);
6135    return c;
6136  }
6137
6138  /**
6139   * Create delegation token secret manager
6140   */
6141  private DelegationTokenSecretManager createDelegationTokenSecretManager(
6142      Configuration conf) {
6143    return new DelegationTokenSecretManager(conf.getLong(
6144        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY,
6145        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT),
6146        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY,
6147            DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT),
6148        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY,
6149            DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT),
6150        DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL,
6151        conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
6152            DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT),
6153        this);
6154  }
6155
6156  /**
6157   * Returns the DelegationTokenSecretManager instance in the namesystem.
6158   * @return delegation token secret manager object
6159   */
6160  DelegationTokenSecretManager getDelegationTokenSecretManager() {
6161    return dtSecretManager;
6162  }
6163
6164  /**
   * @param renewer the designated renewer of the token
   * @return a new Token<DelegationTokenIdentifier> for the renewer
6167   * @throws IOException
6168   */
6169  Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
6170      throws IOException {
6171    Token<DelegationTokenIdentifier> token;
6172    checkOperation(OperationCategory.WRITE);
6173    writeLock();
6174    try {
6175      checkOperation(OperationCategory.WRITE);
6176      checkNameNodeSafeMode("Cannot issue delegation token");
6177      if (!isAllowedDelegationTokenOp()) {
6178        throw new IOException(
6179          "Delegation Token can be issued only with kerberos or web authentication");
6180      }
6181      if (dtSecretManager == null || !dtSecretManager.isRunning()) {
6182        LOG.warn("trying to get DT with no secret manager running");
6183        return null;
6184      }
6185
6186      UserGroupInformation ugi = getRemoteUser();
6187      String user = ugi.getUserName();
6188      Text owner = new Text(user);
6189      Text realUser = null;
6190      if (ugi.getRealUser() != null) {
6191        realUser = new Text(ugi.getRealUser().getUserName());
6192      }
6193      DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(owner,
6194        renewer, realUser);
6195      token = new Token<DelegationTokenIdentifier>(
6196        dtId, dtSecretManager);
6197      long expiryTime = dtSecretManager.getTokenExpiryTime(dtId);
6198      getEditLog().logGetDelegationToken(dtId, expiryTime);
6199    } finally {
6200      writeUnlock();
6201    }
6202    getEditLog().logSync();
6203    return token;
6204  }
6205
6206  /**
6207   * 
6208   * @param token
6209   * @return New expiryTime of the token
6210   * @throws InvalidToken
6211   * @throws IOException
6212   */
6213  long renewDelegationToken(Token<DelegationTokenIdentifier> token)
6214      throws InvalidToken, IOException {
6215    long expiryTime;
6216    checkOperation(OperationCategory.WRITE);
6217    writeLock();
6218    try {
6219      checkOperation(OperationCategory.WRITE);
6220
6221      checkNameNodeSafeMode("Cannot renew delegation token");
6222      if (!isAllowedDelegationTokenOp()) {
6223        throw new IOException(
6224            "Delegation Token can be renewed only with kerberos or web authentication");
6225      }
6226      String renewer = getRemoteUser().getShortUserName();
6227      expiryTime = dtSecretManager.renewToken(token, renewer);
6228      DelegationTokenIdentifier id = new DelegationTokenIdentifier();
6229      ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier());
6230      DataInputStream in = new DataInputStream(buf);
6231      id.readFields(in);
6232      getEditLog().logRenewDelegationToken(id, expiryTime);
6233    } finally {
6234      writeUnlock();
6235    }
6236    getEditLog().logSync();
6237    return expiryTime;
6238  }
6239
6240  /**
6241   * 
6242   * @param token
6243   * @throws IOException
6244   */
6245  void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
6246      throws IOException {
6247    checkOperation(OperationCategory.WRITE);
6248    writeLock();
6249    try {
6250      checkOperation(OperationCategory.WRITE);
6251
6252      checkNameNodeSafeMode("Cannot cancel delegation token");
6253      String canceller = getRemoteUser().getUserName();
6254      DelegationTokenIdentifier id = dtSecretManager
6255        .cancelToken(token, canceller);
6256      getEditLog().logCancelDelegationToken(id);
6257    } finally {
6258      writeUnlock();
6259    }
6260    getEditLog().logSync();
6261  }
6262  
6263  /**
   * @param out output stream to which the secret manager state is saved
   * @param sdPath storage directory path
6266   */
6267  void saveSecretManagerState(DataOutputStream out, String sdPath)
6268      throws IOException {
6269    dtSecretManager.saveSecretManagerState(out, sdPath);
6270  }
6271
6272  /**
   * @param in input stream from which the secret manager state is loaded
6274   */
6275  void loadSecretManagerState(DataInput in) throws IOException {
6276    dtSecretManager.loadSecretManagerState(in);
6277  }
6278
6279  /**
6280   * Log the updateMasterKey operation to edit logs
6281   * 
6282   * @param key new delegation key.
6283   */
6284  public void logUpdateMasterKey(DelegationKey key) {
6285    
6286    assert !isInSafeMode() :
6287      "this should never be called while in safemode, since we stop " +
6288      "the DT manager before entering safemode!";
6289    // No need to hold FSN lock since we don't access any internal
6290    // structures, and this is stopped before the FSN shuts itself
6291    // down, etc.
6292    getEditLog().logUpdateMasterKey(key);
6293    getEditLog().logSync();
6294  }
6295  
6296  /**
6297   * Log the cancellation of expired tokens to edit logs
6298   * 
6299   * @param id token identifier to cancel
6300   */
6301  public void logExpireDelegationToken(DelegationTokenIdentifier id) {
6302    assert !isInSafeMode() :
6303      "this should never be called while in safemode, since we stop " +
6304      "the DT manager before entering safemode!";
6305    // No need to hold FSN lock since we don't access any internal
6306    // structures, and this is stopped before the FSN shuts itself
6307    // down, etc.
6308    getEditLog().logCancelDelegationToken(id);
6309  }  
6310  
6311  private void logReassignLease(String leaseHolder, String src,
6312      String newHolder) {
6313    assert hasWriteLock();
6314    getEditLog().logReassignLease(leaseHolder, src, newHolder);
6315  }
6316  
6317  /**
6318   * 
6319   * @return true if delegation token operation is allowed
6320   */
6321  private boolean isAllowedDelegationTokenOp() throws IOException {
6322    AuthenticationMethod authMethod = getConnectionAuthenticationMethod();
6323    if (UserGroupInformation.isSecurityEnabled()
6324        && (authMethod != AuthenticationMethod.KERBEROS)
6325        && (authMethod != AuthenticationMethod.KERBEROS_SSL)
6326        && (authMethod != AuthenticationMethod.CERTIFICATE)) {
6327      return false;
6328    }
6329    return true;
6330  }
6331  
6332  /**
6333   * Returns authentication method used to establish the connection
6334   * @return AuthenticationMethod used to establish connection
6335   * @throws IOException
6336   */
6337  private AuthenticationMethod getConnectionAuthenticationMethod()
6338      throws IOException {
6339    UserGroupInformation ugi = getRemoteUser();
6340    AuthenticationMethod authMethod = ugi.getAuthenticationMethod();
6341    if (authMethod == AuthenticationMethod.PROXY) {
6342      authMethod = ugi.getRealUser().getAuthenticationMethod();
6343    }
6344    return authMethod;
6345  }
6346  
6347  /**
   * Methods invoked by clients arrive over RPC and remain in an
   * RPC call context even if the client exits.
6350   */
6351  private boolean isExternalInvocation() {
6352    return Server.isRpcInvocation() || NamenodeWebHdfsMethods.isWebHdfsInvocation();
6353  }
6354
6355  private static InetAddress getRemoteIp() {
6356    InetAddress ip = Server.getRemoteIp();
6357    if (ip != null) {
6358      return ip;
6359    }
6360    return NamenodeWebHdfsMethods.getRemoteIp();
6361  }
6362  
  // optimize ugi lookup for RPC operations to avoid a trip through
  // UGI.getCurrentUser, which is synchronized
6365  private static UserGroupInformation getRemoteUser() throws IOException {
6366    return NameNode.getRemoteUser();
6367  }
6368  
6369  /**
6370   * Log fsck event in the audit log 
6371   */
6372  void logFsckEvent(String src, InetAddress remoteAddress) throws IOException {
6373    if (isAuditEnabled()) {
6374      logAuditEvent(true, getRemoteUser(),
6375                    remoteAddress,
6376                    "fsck", src, null, null);
6377    }
6378  }
6379  /**
6380   * Register NameNodeMXBean
6381   */
6382  private void registerMXBean() {
6383    mxbeanName = MBeans.register("NameNode", "NameNodeInfo", this);
6384  }
6385
6386  /**
6387   * Class representing Namenode information for JMX interfaces
6388   */
6389  @Override // NameNodeMXBean
6390  public String getVersion() {
6391    return VersionInfo.getVersion() + ", r" + VersionInfo.getRevision();
6392  }
6393
6394  @Override // NameNodeMXBean
6395  public long getUsed() {
6396    return this.getCapacityUsed();
6397  }
6398
6399  @Override // NameNodeMXBean
6400  public long getFree() {
6401    return this.getCapacityRemaining();
6402  }
6403
6404  @Override // NameNodeMXBean
6405  public long getTotal() {
6406    return this.getCapacityTotal();
6407  }
6408
6409  @Override // NameNodeMXBean
6410  public String getSafemode() {
6411    if (!this.isInSafeMode())
6412      return "";
6413    return "Safe mode is ON. " + this.getSafeModeTip();
6414  }
6415
6416  @Override // NameNodeMXBean
6417  public boolean isUpgradeFinalized() {
6418    return this.getFSImage().isUpgradeFinalized();
6419  }
6420
6421  @Override // NameNodeMXBean
6422  public long getNonDfsUsedSpace() {
6423    return datanodeStatistics.getCapacityUsedNonDFS();
6424  }
6425
6426  @Override // NameNodeMXBean
6427  public float getPercentUsed() {
6428    return datanodeStatistics.getCapacityUsedPercent();
6429  }
6430
6431  @Override // NameNodeMXBean
6432  public long getBlockPoolUsedSpace() {
6433    return datanodeStatistics.getBlockPoolUsed();
6434  }
6435
6436  @Override // NameNodeMXBean
6437  public float getPercentBlockPoolUsed() {
6438    return datanodeStatistics.getPercentBlockPoolUsed();
6439  }
6440
6441  @Override // NameNodeMXBean
6442  public float getPercentRemaining() {
6443    return datanodeStatistics.getCapacityRemainingPercent();
6444  }
6445
6446  @Override // NameNodeMXBean
6447  public long getCacheCapacity() {
6448    return datanodeStatistics.getCacheCapacity();
6449  }
6450
6451  @Override // NameNodeMXBean
6452  public long getCacheUsed() {
6453    return datanodeStatistics.getCacheUsed();
6454  }
6455
6456  @Override // NameNodeMXBean
6457  public long getTotalBlocks() {
6458    return getBlocksTotal();
6459  }
6460
6461  @Override // NameNodeMXBean
6462  @Metric
6463  public long getTotalFiles() {
6464    return getFilesTotal();
6465  }
6466
6467  @Override // NameNodeMXBean
6468  public long getNumberOfMissingBlocks() {
6469    return getMissingBlocksCount();
6470  }
6471  
6472  @Override // NameNodeMXBean
6473  public int getThreads() {
6474    return ManagementFactory.getThreadMXBean().getThreadCount();
6475  }
6476
6477  /**
   * Returns a JSON representation of a map keyed by host name, where each
   * value is a map of live-node attribute names to their values
6480   */
6481  @Override // NameNodeMXBean
6482  public String getLiveNodes() {
6483    final Map<String, Map<String,Object>> info = 
6484      new HashMap<String, Map<String,Object>>();
6485    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6486    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);
6487    for (DatanodeDescriptor node : live) {
6488      Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
6489          .put("infoAddr", node.getInfoAddr())
6490          .put("infoSecureAddr", node.getInfoSecureAddr())
6491          .put("xferaddr", node.getXferAddr())
6492          .put("lastContact", getLastContact(node))
6493          .put("usedSpace", getDfsUsed(node))
6494          .put("adminState", node.getAdminState().toString())
6495          .put("nonDfsUsedSpace", node.getNonDfsUsed())
6496          .put("capacity", node.getCapacity())
6497          .put("numBlocks", node.numBlocks())
6498          .put("version", node.getSoftwareVersion())
6499          .put("used", node.getDfsUsed())
6500          .put("remaining", node.getRemaining())
6501          .put("blockScheduled", node.getBlocksScheduled())
6502          .put("blockPoolUsed", node.getBlockPoolUsed())
6503          .put("blockPoolUsedPercent", node.getBlockPoolUsedPercent())
6504          .put("volfails", node.getVolumeFailures())
6505          .build();
6506
6507      info.put(node.getHostName(), innerinfo);
6508    }
6509    return JSON.toString(info);
6510  }
6511
6512  /**
   * Returns a JSON representation of a map keyed by host name, where each
   * value is a map of dead-node attribute names to their values
6515   */
6516  @Override // NameNodeMXBean
6517  public String getDeadNodes() {
6518    final Map<String, Map<String, Object>> info = 
6519      new HashMap<String, Map<String, Object>>();
6520    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
6521    blockManager.getDatanodeManager().fetchDatanodes(null, dead, true);
6522    for (DatanodeDescriptor node : dead) {
6523      Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
6524          .put("lastContact", getLastContact(node))
6525          .put("decommissioned", node.isDecommissioned())
6526          .put("xferaddr", node.getXferAddr())
6527          .build();
6528      info.put(node.getHostName(), innerinfo);
6529    }
6530    return JSON.toString(info);
6531  }
6532
6533  /**
   * Returns a JSON representation of a map keyed by host name, where each
   * value is a map of decommissioning-node attribute names to their values
6536   */
6537  @Override // NameNodeMXBean
6538  public String getDecomNodes() {
6539    final Map<String, Map<String, Object>> info = 
6540      new HashMap<String, Map<String, Object>>();
6541    final List<DatanodeDescriptor> decomNodeList = blockManager.getDatanodeManager(
6542        ).getDecommissioningNodes();
6543    for (DatanodeDescriptor node : decomNodeList) {
6544      Map<String, Object> innerinfo = ImmutableMap
6545          .<String, Object> builder()
6546          .put("xferaddr", node.getXferAddr())
6547          .put("underReplicatedBlocks",
6548              node.decommissioningStatus.getUnderReplicatedBlocks())
6549          .put("decommissionOnlyReplicas",
6550              node.decommissioningStatus.getDecommissionOnlyReplicas())
6551          .put("underReplicateInOpenFiles",
6552              node.decommissioningStatus.getUnderReplicatedInOpenFiles())
6553          .build();
6554      info.put(node.getHostName(), innerinfo);
6555    }
6556    return JSON.toString(info);
6557  }
6558
6559  private long getLastContact(DatanodeDescriptor alivenode) {
6560    return (Time.now() - alivenode.getLastUpdate())/1000;
6561  }
6562
6563  private long getDfsUsed(DatanodeDescriptor alivenode) {
6564    return alivenode.getDfsUsed();
6565  }
6566
6567  @Override  // NameNodeMXBean
6568  public String getClusterId() {
6569    return dir.fsImage.getStorage().getClusterID();
6570  }
6571  
6572  @Override  // NameNodeMXBean
6573  public String getBlockPoolId() {
6574    return blockPoolId;
6575  }
6576  
6577  @Override  // NameNodeMXBean
6578  public String getNameDirStatuses() {
6579    Map<String, Map<File, StorageDirType>> statusMap =
6580      new HashMap<String, Map<File, StorageDirType>>();
6581    
6582    Map<File, StorageDirType> activeDirs = new HashMap<File, StorageDirType>();
6583    for (Iterator<StorageDirectory> it
6584        = getFSImage().getStorage().dirIterator(); it.hasNext();) {
6585      StorageDirectory st = it.next();
6586      activeDirs.put(st.getRoot(), st.getStorageDirType());
6587    }
6588    statusMap.put("active", activeDirs);
6589    
6590    List<Storage.StorageDirectory> removedStorageDirs
6591        = getFSImage().getStorage().getRemovedStorageDirs();
6592    Map<File, StorageDirType> failedDirs = new HashMap<File, StorageDirType>();
6593    for (StorageDirectory st : removedStorageDirs) {
6594      failedDirs.put(st.getRoot(), st.getStorageDirType());
6595    }
6596    statusMap.put("failed", failedDirs);
6597    
6598    return JSON.toString(statusMap);
6599  }
6600
6601  @Override // NameNodeMXBean
6602  public String getNodeUsage() {
6603    float median = 0;
6604    float max = 0;
6605    float min = 0;
6606    float dev = 0;
6607
6608    final Map<String, Map<String,Object>> info =
6609        new HashMap<String, Map<String,Object>>();
6610    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6611    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);
6612
6613    if (live.size() > 0) {
6614      float totalDfsUsed = 0;
6615      float[] usages = new float[live.size()];
6616      int i = 0;
6617      for (DatanodeDescriptor dn : live) {
6618        usages[i++] = dn.getDfsUsedPercent();
6619        totalDfsUsed += dn.getDfsUsedPercent();
6620      }
6621      totalDfsUsed /= live.size();
6622      Arrays.sort(usages);
6623      median = usages[usages.length / 2];
6624      max = usages[usages.length - 1];
6625      min = usages[0];
6626
6627      for (i = 0; i < usages.length; i++) {
6628        dev += (usages[i] - totalDfsUsed) * (usages[i] - totalDfsUsed);
6629      }
6630      dev = (float) Math.sqrt(dev / usages.length);
6631    }
6632
6633    final Map<String, Object> innerInfo = new HashMap<String, Object>();
6634    innerInfo.put("min", StringUtils.format("%.2f%%", min));
6635    innerInfo.put("median", StringUtils.format("%.2f%%", median));
6636    innerInfo.put("max", StringUtils.format("%.2f%%", max));
6637    innerInfo.put("stdDev", StringUtils.format("%.2f%%", dev));
6638    info.put("nodeUsage", innerInfo);
6639
6640    return JSON.toString(info);
6641  }
6642
6643  @Override  // NameNodeMXBean
6644  public String getNameJournalStatus() {
6645    List<Map<String, String>> jasList = new ArrayList<Map<String, String>>();
6646    FSEditLog log = getFSImage().getEditLog();
6647    if (log != null) {
6648      boolean openForWrite = log.isOpenForWrite();
6649      for (JournalAndStream jas : log.getJournals()) {
6650        final Map<String, String> jasMap = new HashMap<String, String>();
6651        String manager = jas.getManager().toString();
6652
6653        jasMap.put("required", String.valueOf(jas.isRequired()));
6654        jasMap.put("disabled", String.valueOf(jas.isDisabled()));
6655        jasMap.put("manager", manager);
6656
6657        if (jas.isDisabled()) {
6658          jasMap.put("stream", "Failed");
6659        } else if (openForWrite) {
6660          EditLogOutputStream elos = jas.getCurrentStream();
6661          if (elos != null) {
6662            jasMap.put("stream", elos.generateReport());
6663          } else {
6664            jasMap.put("stream", "not currently writing");
6665          }
6666        } else {
6667          jasMap.put("stream", "open for read");
6668        }
6669        jasList.add(jasMap);
6670      }
6671    }
6672    return JSON.toString(jasList);
6673  }
6674
  @Override // NameNodeMXBean
6676  public String getJournalTransactionInfo() {
6677    Map<String, String> txnIdMap = new HashMap<String, String>();
6678    txnIdMap.put("LastAppliedOrWrittenTxId",
6679        Long.toString(this.getFSImage().getLastAppliedOrWrittenTxId()));
6680    txnIdMap.put("MostRecentCheckpointTxId",
6681        Long.toString(this.getFSImage().getMostRecentCheckpointTxId()));
6682    return JSON.toString(txnIdMap);
6683  }
6684  
6685  @Override  // NameNodeMXBean
6686  public String getNNStarted() {
6687    return getStartTime().toString();
6688  }
6689
6690  @Override  // NameNodeMXBean
6691  public String getCompileInfo() {
6692    return VersionInfo.getDate() + " by " + VersionInfo.getUser() +
6693        " from " + VersionInfo.getBranch();
6694  }
6695
6696  /** @return the block manager. */
6697  public BlockManager getBlockManager() {
6698    return blockManager;
6699  }
6700  /** @return the FSDirectory. */
6701  public FSDirectory getFSDirectory() {
6702    return dir;
6703  }
6704  /** @return the cache manager. */
6705  public CacheManager getCacheManager() {
6706    return cacheManager;
6707  }
6708
6709  @Override  // NameNodeMXBean
6710  public String getCorruptFiles() {
6711    List<String> list = new ArrayList<String>();
6712    Collection<FSNamesystem.CorruptFileBlockInfo> corruptFileBlocks;
6713    try {
6714      corruptFileBlocks = listCorruptFileBlocks("/", null);
6715      int corruptFileCount = corruptFileBlocks.size();
6716      if (corruptFileCount != 0) {
6717        for (FSNamesystem.CorruptFileBlockInfo c : corruptFileBlocks) {
6718          list.add(c.toString());
6719        }
6720      }
6721    } catch (IOException e) {
6722      LOG.warn("Get corrupt file blocks returned error: " + e.getMessage());
6723    }
6724    return JSON.toString(list);
6725  }
6726
  @Override  // NameNodeMXBean
6728  public int getDistinctVersionCount() {
6729    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
6730      .size();
6731  }
6732
  @Override  // NameNodeMXBean
6734  public Map<String, Integer> getDistinctVersions() {
6735    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
6736  }
6737
  @Override  // NameNodeMXBean
6739  public String getSoftwareVersion() {
6740    return VersionInfo.getVersion();
6741  }
6742
6743  /**
6744   * Verifies that the given identifier and password are valid and match.
6745   * @param identifier Token identifier.
6746   * @param password Password in the token.
6747   */
6748  public synchronized void verifyToken(DelegationTokenIdentifier identifier,
6749      byte[] password) throws InvalidToken, RetriableException {
6750    try {
6751      getDelegationTokenSecretManager().verifyToken(identifier, password);
6752    } catch (InvalidToken it) {
6753      if (inTransitionToActive()) {
6754        throw new RetriableException(it);
6755      }
6756      throw it;
6757    }
6758  }
6759  
6760  @Override
6761  public boolean isGenStampInFuture(Block block) {
6762    if (isLegacyBlock(block)) {
6763      return block.getGenerationStamp() > getGenerationStampV1();
6764    } else {
6765      return block.getGenerationStamp() > getGenerationStampV2();
6766    }
6767  }
6768
6769  @VisibleForTesting
6770  public EditLogTailer getEditLogTailer() {
6771    return editLogTailer;
6772  }
6773  
6774  @VisibleForTesting
6775  public void setEditLogTailerForTests(EditLogTailer tailer) {
6776    this.editLogTailer = tailer;
6777  }
6778  
6779  @VisibleForTesting
6780  void setFsLockForTests(ReentrantReadWriteLock lock) {
6781    this.fsLock.coarseLock = lock;
6782  }
6783  
6784  @VisibleForTesting
6785  ReentrantReadWriteLock getFsLockForTests() {
6786    return fsLock.coarseLock;
6787  }
6788
6789  @VisibleForTesting
6790  public SafeModeInfo getSafeModeInfoForTests() {
6791    return safeMode;
6792  }
6793  
6794  @VisibleForTesting
6795  public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
6796    this.nnResourceChecker = nnResourceChecker;
6797  }
6798
6799  @Override
6800  public boolean isAvoidingStaleDataNodesForWrite() {
6801    return this.blockManager.getDatanodeManager()
6802        .shouldAvoidStaleDataNodesForWrite();
6803  }
6804
6805  @Override // FSClusterStats
6806  public int getNumDatanodesInService() {
6807    return getNumLiveDataNodes() - getNumDecomLiveDataNodes();
6808  }
6809
6810  public SnapshotManager getSnapshotManager() {
6811    return snapshotManager;
6812  }
6813  
  /** Allow snapshots on a directory. */
6815  void allowSnapshot(String path) throws SafeModeException, IOException {
6816    checkOperation(OperationCategory.WRITE);
6817    writeLock();
6818    try {
6819      checkOperation(OperationCategory.WRITE);
6820      checkNameNodeSafeMode("Cannot allow snapshot for " + path);
6821      checkSuperuserPrivilege();
6822
6823      dir.writeLock();
6824      try {
6825        snapshotManager.setSnapshottable(path, true);
6826      } finally {
6827        dir.writeUnlock();
6828      }
6829      getEditLog().logAllowSnapshot(path);
6830    } finally {
6831      writeUnlock();
6832    }
6833    getEditLog().logSync();
6834
6835    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
6836      logAuditEvent(true, "allowSnapshot", path, null, null);
6837    }
6838  }
6839  
6840  /** Disallow snapshot on a directory. */
6841  void disallowSnapshot(String path) throws SafeModeException, IOException {
6842    checkOperation(OperationCategory.WRITE);
6843    writeLock();
6844    try {
6845      checkOperation(OperationCategory.WRITE);
6846      checkNameNodeSafeMode("Cannot disallow snapshot for " + path);
6847      checkSuperuserPrivilege();
6848
6849      dir.writeLock();
6850      try {
6851        snapshotManager.resetSnapshottable(path);
6852      } finally {
6853        dir.writeUnlock();
6854      }
6855      getEditLog().logDisallowSnapshot(path);
6856    } finally {
6857      writeUnlock();
6858    }
6859    getEditLog().logSync();
6860    
6861    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
6862      logAuditEvent(true, "disallowSnapshot", path, null, null);
6863    }
6864  }
6865  
6866  /**
6867   * Create a snapshot
6868   * @param snapshotRoot The directory path where the snapshot is taken
6869   * @param snapshotName The name of the snapshot
6870   */
6871  String createSnapshot(String snapshotRoot, String snapshotName)
6872      throws SafeModeException, IOException {
6873    checkOperation(OperationCategory.WRITE);
6874    final FSPermissionChecker pc = getPermissionChecker();
6875    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
6876        null);
6877    if (cacheEntry != null && cacheEntry.isSuccess()) {
6878      return (String) cacheEntry.getPayload();
6879    }
6880    writeLock();
6881    String snapshotPath = null;
6882    try {
6883      checkOperation(OperationCategory.WRITE);
6884      checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot);
6885      if (isPermissionEnabled) {
6886        checkOwner(pc, snapshotRoot);
6887      }
6888
6889      if (snapshotName == null || snapshotName.isEmpty()) {
6890        snapshotName = Snapshot.generateDefaultSnapshotName();
6891      }
6892      dir.verifySnapshotName(snapshotName, snapshotRoot);
6893      dir.writeLock();
6894      try {
6895        snapshotPath = snapshotManager.createSnapshot(snapshotRoot, snapshotName);
6896      } finally {
6897        dir.writeUnlock();
6898      }
6899      getEditLog().logCreateSnapshot(snapshotRoot, snapshotName,
6900          cacheEntry != null);
6901    } finally {
6902      writeUnlock();
6903      RetryCache.setState(cacheEntry, snapshotPath != null, snapshotPath);
6904    }
6905    getEditLog().logSync();
6906    
6907    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
6908      logAuditEvent(true, "createSnapshot", snapshotRoot, snapshotPath, null);
6909    }
6910    return snapshotPath;
6911  }
6912  
6913  /**
6914   * Rename a snapshot
6915   * @param path The directory path where the snapshot was taken
6916   * @param snapshotOldName Old snapshot name
6917   * @param snapshotNewName New snapshot name
6918   * @throws SafeModeException
6919   * @throws IOException 
6920   */
6921  void renameSnapshot(String path, String snapshotOldName,
6922      String snapshotNewName) throws SafeModeException, IOException {
6923    checkOperation(OperationCategory.WRITE);
6924    final FSPermissionChecker pc = getPermissionChecker();
6925    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
6926    if (cacheEntry != null && cacheEntry.isSuccess()) {
6927      return; // Return previous response
6928    }
6929    writeLock();
6930    boolean success = false;
6931    try {
6932      checkOperation(OperationCategory.WRITE);
6933      checkNameNodeSafeMode("Cannot rename snapshot for " + path);
6934      if (isPermissionEnabled) {
6935        checkOwner(pc, path);
6936      }
6937      dir.verifySnapshotName(snapshotNewName, path);
6938      
6939      snapshotManager.renameSnapshot(path, snapshotOldName, snapshotNewName);
6940      getEditLog().logRenameSnapshot(path, snapshotOldName, snapshotNewName,
6941          cacheEntry != null);
6942      success = true;
6943    } finally {
6944      writeUnlock();
6945      RetryCache.setState(cacheEntry, success);
6946    }
6947    getEditLog().logSync();
6948    
6949    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
6950      String oldSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotOldName);
6951      String newSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotNewName);
6952      logAuditEvent(true, "renameSnapshot", oldSnapshotRoot, newSnapshotRoot, null);
6953    }
6954  }
6955  
6956  /**
6957   * Get the list of snapshottable directories that are owned 
6958   * by the current user. Return all the snapshottable directories if the 
6959   * current user is a super user.
6960   * @return The list of all the current snapshottable directories
6961   * @throws IOException
6962   */
6963  public SnapshottableDirectoryStatus[] getSnapshottableDirListing()
6964      throws IOException {
6965    SnapshottableDirectoryStatus[] status = null;
6966    checkOperation(OperationCategory.READ);
6967    final FSPermissionChecker checker = getPermissionChecker();
6968    readLock();
6969    try {
6970      checkOperation(OperationCategory.READ);
6971      final String user = checker.isSuperUser()? null : checker.getUser();
6972      status = snapshotManager.getSnapshottableDirListing(user);
6973    } finally {
6974      readUnlock();
6975    }
6976    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
6977      logAuditEvent(true, "listSnapshottableDirectory", null, null, null);
6978    }
6979    return status;
6980  }
6981  
6982  /**
6983   * Get the difference between two snapshots (or between a snapshot and the
6984   * current status) of a snapshottable directory.
6985   * 
6986   * @param path The full path of the snapshottable directory.
6987   * @param fromSnapshot Name of the snapshot to calculate the diff from. Null
6988   *          or empty string indicates the current tree.
   * @param toSnapshot Name of the snapshot to calculate the diff to. Null or
6990   *          empty string indicates the current tree.
6991   * @return A report about the difference between {@code fromSnapshot} and 
6992   *         {@code toSnapshot}. Modified/deleted/created/renamed files and 
6993   *         directories belonging to the snapshottable directories are listed 
6994   *         and labeled as M/-/+/R respectively. 
6995   * @throws IOException
6996   */
6997  SnapshotDiffReport getSnapshotDiffReport(String path,
6998      String fromSnapshot, String toSnapshot) throws IOException {
6999    SnapshotDiffInfo diffs = null;
7000    checkOperation(OperationCategory.READ);
7001    final FSPermissionChecker pc = getPermissionChecker();
7002    readLock();
7003    try {
7004      checkOperation(OperationCategory.READ);
7005      if (isPermissionEnabled) {
7006        checkSubtreeReadPermission(pc, path, fromSnapshot);
7007        checkSubtreeReadPermission(pc, path, toSnapshot);
7008      }
7009      diffs = snapshotManager.diff(path, fromSnapshot, toSnapshot);
7010    } finally {
7011      readUnlock();
7012    }
7013    
7014    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
7015      logAuditEvent(true, "computeSnapshotDiff", null, null, null);
7016    }
7017    return diffs != null ? diffs.generateReport() : new SnapshotDiffReport(
7018        path, fromSnapshot, toSnapshot,
7019        Collections.<DiffReportEntry> emptyList());
7020  }
7021  
7022  private void checkSubtreeReadPermission(final FSPermissionChecker pc,
7023      final String snapshottablePath, final String snapshot)
7024          throws AccessControlException, UnresolvedLinkException {
7025    final String fromPath = snapshot == null?
7026        snapshottablePath: Snapshot.getSnapshotPath(snapshottablePath, snapshot);
7027    checkPermission(pc, fromPath, false, null, null, FsAction.READ, FsAction.READ);
7028  }
7029  
7030  /**
7031   * Delete a snapshot of a snapshottable directory
7032   * @param snapshotRoot The snapshottable directory
7033   * @param snapshotName The name of the to-be-deleted snapshot
7034   * @throws SafeModeException
7035   * @throws IOException
7036   */
7037  void deleteSnapshot(String snapshotRoot, String snapshotName)
7038      throws SafeModeException, IOException {
7039    checkOperation(OperationCategory.WRITE);
7040    final FSPermissionChecker pc = getPermissionChecker();
7041    
7042    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
7043    if (cacheEntry != null && cacheEntry.isSuccess()) {
7044      return; // Return previous response
7045    }
7046    boolean success = false;
7047    BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
7048    writeLock();
7049    try {
7050      checkOperation(OperationCategory.WRITE);
7051      checkNameNodeSafeMode("Cannot delete snapshot for " + snapshotRoot);
7052      if (isPermissionEnabled) {
7053        checkOwner(pc, snapshotRoot);
7054      }
7055
7056      List<INode> removedINodes = new ChunkedArrayList<INode>();
7057      dir.writeLock();
7058      try {
7059        snapshotManager.deleteSnapshot(snapshotRoot, snapshotName,
7060            collectedBlocks, removedINodes);
7061        dir.removeFromInodeMap(removedINodes);
7062      } finally {
7063        dir.writeUnlock();
7064      }
7065      removedINodes.clear();
      getEditLog().logDeleteSnapshot(snapshotRoot, snapshotName,
          cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      RetryCache.setState(cacheEntry, success);
    }
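    // Sync the edit log outside the namesystem write lock so that flushing
    // the journal does not block other RPC handlers.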
    getEditLog().logSync();

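    // Block invalidation is likewise deferred until after the lock is
    // released; removeBlocks processes the collected blocks in bounded
    // increments rather than under one long lock hold.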
    removeBlocks(collectedBlocks);
    collectedBlocks.clear();

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      String rootPath = Snapshot.getSnapshotPath(snapshotRoot, snapshotName);
      logAuditEvent(true, "deleteSnapshot", rootPath, null, null);
    }
  }

  /**
   * Remove a list of INodeDirectorySnapshottable from the SnapshotManager.
   * @param toRemove the list of INodeDirectorySnapshottable to be removed
   */
  void removeSnapshottableDirs(List<INodeDirectorySnapshottable> toRemove) {
    if (snapshotManager != null) {
      snapshotManager.removeSnapshottable(toRemove);
    }
  }

  long addCacheDirective(CacheDirectiveInfo directive, EnumSet<CacheFlag> flags)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc = isPermissionEnabled ?
        getPermissionChecker() : null;
    CacheEntryWithPayload cacheEntry =
        RetryCache.waitForCompletion(retryCache, null);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return (Long) cacheEntry.getPayload();
    }
    boolean success = false;
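    // Unless the caller passes CacheFlag.FORCE, wait for the cache
    // replication monitor to finish a rescan so pool usage statistics are
    // fresh when the new directive is checked against pool limits.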
    if (!flags.contains(CacheFlag.FORCE)) {
      cacheManager.waitForRescanIfNeeded();
    }
    writeLock();
    Long result = null;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache directive", safeMode);
      }
      if (directive.getId() != null) {
        throw new IOException("addDirective: you cannot specify an ID " +
            "for this operation.");
      }
      CacheDirectiveInfo effectiveDirective =
          cacheManager.addDirective(directive, pc, flags);
      getEditLog().logAddCacheDirectiveInfo(effectiveDirective,
          cacheEntry != null);
      result = effectiveDirective.getId();
      success = true;
    } finally {
      writeUnlock();
      if (success) {
        getEditLog().logSync();
      }
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "addCacheDirective", null, null, null);
      }
      RetryCache.setState(cacheEntry, success, result);
    }
    return result;
  }

  void modifyCacheDirective(CacheDirectiveInfo directive,
      EnumSet<CacheFlag> flags) throws IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc = isPermissionEnabled ?
        getPermissionChecker() : null;
    boolean success = false;
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return;
    }
    if (!flags.contains(CacheFlag.FORCE)) {
      cacheManager.waitForRescanIfNeeded();
    }
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot modify cache directive", safeMode);
      }
      cacheManager.modifyDirective(directive, pc, flags);
      getEditLog().logModifyCacheDirectiveInfo(directive,
          cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      if (success) {
        getEditLog().logSync();
      }
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "modifyCacheDirective", null, null, null);
      }
      RetryCache.setState(cacheEntry, success);
    }
  }

  void removeCacheDirective(Long id) throws IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc = isPermissionEnabled ?
        getPermissionChecker() : null;
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return;
    }
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache directive", safeMode);
      }
      cacheManager.removeDirective(id, pc);
      getEditLog().logRemoveCacheDirectiveInfo(id, cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "removeCacheDirective", null, null,
            null);
      }
      RetryCache.setState(cacheEntry, success);
    }
    getEditLog().logSync();
  }

  BatchedListEntries<CacheDirectiveEntry> listCacheDirectives(
      long startId, CacheDirectiveInfo filter) throws IOException {
    checkOperation(OperationCategory.READ);
    final FSPermissionChecker pc = isPermissionEnabled ?
        getPermissionChecker() : null;
    BatchedListEntries<CacheDirectiveEntry> results;
    cacheManager.waitForRescanIfNeeded();
    readLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.READ);
      results =
          cacheManager.listCacheDirectives(startId, filter, pc);
      success = true;
    } finally {
      readUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "listCacheDirectives", null, null,
            null);
      }
    }
    return results;
  }

  public void addCachePool(CachePoolInfo req) throws IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc = isPermissionEnabled ?
        getPermissionChecker() : null;
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache pool " + req.getPoolName(), safeMode);
      }
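      // Pool-level operations (add/modify/remove) are administrative and
      // require superuser privilege; access to individual directives is
      // instead governed by each pool's mode/ACL inside CacheManager.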
      if (pc != null) {
        pc.checkSuperuserPrivilege();
      }
      CachePoolInfo info = cacheManager.addCachePool(req);
      getEditLog().logAddCachePool(info, cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "addCachePool", req.getPoolName(), null, null);
      }
      RetryCache.setState(cacheEntry, success);
    }

    getEditLog().logSync();
  }

  public void modifyCachePool(CachePoolInfo req) throws IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc =
        isPermissionEnabled ? getPermissionChecker() : null;
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot modify cache pool " + req.getPoolName(), safeMode);
      }
      if (pc != null) {
        pc.checkSuperuserPrivilege();
      }
      cacheManager.modifyCachePool(req);
      getEditLog().logModifyCachePool(req, cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "modifyCachePool", req.getPoolName(), null, null);
      }
      RetryCache.setState(cacheEntry, success);
    }

    getEditLog().logSync();
  }


  public void removeCachePool(String cachePoolName) throws IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc =
        isPermissionEnabled ? getPermissionChecker() : null;
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache pool " + cachePoolName, safeMode);
      }
      if (pc != null) {
        pc.checkSuperuserPrivilege();
      }
      cacheManager.removeCachePool(cachePoolName);
      getEditLog().logRemoveCachePool(cachePoolName, cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "removeCachePool", cachePoolName, null, null);
      }
      RetryCache.setState(cacheEntry, success);
    }

    getEditLog().logSync();
  }

  public BatchedListEntries<CachePoolEntry> listCachePools(String prevKey)
      throws IOException {
    final FSPermissionChecker pc =
        isPermissionEnabled ? getPermissionChecker() : null;
    BatchedListEntries<CachePoolEntry> results;
    checkOperation(OperationCategory.READ);
    boolean success = false;
    cacheManager.waitForRescanIfNeeded();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      results = cacheManager.listCachePools(pc, prevKey);
      success = true;
    } finally {
      readUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "listCachePools", null, null, null);
      }
    }
    return results;
  }

  /**
   * Default AuditLogger implementation; used when no audit logger is
   * defined in the config file. It can also be explicitly listed in the
   * config file.
   */
  private static class DefaultAuditLogger extends HdfsAuditLogger {

    private boolean logTokenTrackingId;

    @Override
    public void initialize(Configuration conf) {
      logTokenTrackingId = conf.getBoolean(
          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT);
    }

    @Override
    public void logAuditEvent(boolean succeeded, String userName,
        InetAddress addr, String cmd, String src, String dst,
        FileStatus status, UserGroupInformation ugi,
        DelegationTokenSecretManager dtSecretManager) {
      if (auditLog.isInfoEnabled()) {
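        // Reuse a per-thread StringBuilder (auditBuffer is a ThreadLocal) so
        // the hot audit path does not allocate a fresh buffer per event.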
        final StringBuilder sb = auditBuffer.get();
        sb.setLength(0);
        sb.append("allowed=").append(succeeded).append("\t");
        sb.append("ugi=").append(userName).append("\t");
        sb.append("ip=").append(addr).append("\t");
        sb.append("cmd=").append(cmd).append("\t");
        sb.append("src=").append(src).append("\t");
        sb.append("dst=").append(dst).append("\t");
        if (null == status) {
          sb.append("perm=null");
        } else {
          sb.append("perm=");
          sb.append(status.getOwner()).append(":");
          sb.append(status.getGroup()).append(":");
          sb.append(status.getPermission());
        }
        if (logTokenTrackingId) {
          sb.append("\t").append("trackingId=");
          String trackingId = null;
          if (ugi != null && dtSecretManager != null
              && ugi.getAuthenticationMethod() == AuthenticationMethod.TOKEN) {
            for (TokenIdentifier tid: ugi.getTokenIdentifiers()) {
              if (tid instanceof DelegationTokenIdentifier) {
                DelegationTokenIdentifier dtid =
                    (DelegationTokenIdentifier)tid;
                trackingId = dtSecretManager.getTokenTrackingId(dtid);
                break;
              }
            }
          }
          sb.append(trackingId);
        }
        logAuditMessage(sb.toString());
      }
    }

    public void logAuditMessage(String message) {
      auditLog.info(message);
    }
  }

  private static void enableAsyncAuditLog() {
    if (!(auditLog instanceof Log4JLogger)) {
      LOG.warn("Log4j is required to enable async auditlog");
      return;
    }
    Logger logger = ((Log4JLogger)auditLog).getLogger();
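    // Log4j's AsyncAppender queues events in a bounded buffer and writes
    // them from a background dispatcher thread, so audit logging no longer
    // blocks RPC handlers on synchronous appender I/O.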
    @SuppressWarnings("unchecked")
    List<Appender> appenders = Collections.list(logger.getAllAppenders());
    // failsafe against trying to async it more than once
    if (!appenders.isEmpty() && !(appenders.get(0) instanceof AsyncAppender)) {
      AsyncAppender asyncAppender = new AsyncAppender();
      // change logger to have an async appender containing all the
      // previously configured appenders
      for (Appender appender : appenders) {
        logger.removeAppender(appender);
        asyncAppender.addAppender(appender);
      }
      logger.addAppender(asyncAppender);
    }
  }
}