001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.util.Time.now;
021
022import java.io.DataInput;
023import java.io.DataInputStream;
024import java.io.DataOutputStream;
025import java.io.File;
026import java.io.FileInputStream;
027import java.io.FileNotFoundException;
028import java.io.FileOutputStream;
029import java.io.IOException;
030import java.security.DigestInputStream;
031import java.security.DigestOutputStream;
032import java.security.MessageDigest;
033import java.util.ArrayList;
034import java.util.Arrays;
035import java.util.HashMap;
036import java.util.List;
037import java.util.Map;
038
039import org.apache.commons.logging.Log;
040import org.apache.hadoop.HadoopIllegalArgumentException;
041import org.apache.hadoop.classification.InterfaceAudience;
042import org.apache.hadoop.classification.InterfaceStability;
043import org.apache.hadoop.conf.Configuration;
044import org.apache.hadoop.fs.FileSystem;
045import org.apache.hadoop.fs.Path;
046import org.apache.hadoop.fs.PathIsNotDirectoryException;
047import org.apache.hadoop.fs.UnresolvedLinkException;
048import org.apache.hadoop.fs.permission.PermissionStatus;
049import org.apache.hadoop.hdfs.protocol.HdfsConstants;
050import org.apache.hadoop.hdfs.protocol.LayoutVersion;
051import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
052import org.apache.hadoop.hdfs.protocol.LayoutFlags;
053import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
054import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
055import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
056import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
057import org.apache.hadoop.hdfs.server.namenode.snapshot.FileWithSnapshot.FileDiffList;
058import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable;
059import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectoryWithSnapshot;
060import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileUnderConstructionWithSnapshot;
061import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileWithSnapshot;
062import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
063import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat;
064import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap;
065import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
066import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
067import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
068import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
069import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
070import org.apache.hadoop.hdfs.util.ReadOnlyList;
071import org.apache.hadoop.io.MD5Hash;
072import org.apache.hadoop.io.Text;
073
074/**
075 * Contains inner classes for reading or writing the on-disk format for
076 * FSImages.
077 * 
078 * In particular, the format of the FSImage looks like:
079 * <pre>
080 * FSImage {
081 *   layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long,
082 *   namesystemGenerationStampV1: long, namesystemGenerationStampV2: long,
 *   generationStampAtBlockIdSwitch: long, lastAllocatedBlockId: long,
 *   transactionID: long, snapshotCounter: int, numberOfSnapshots: int,
085 *   numOfSnapshottableDirs: int,
086 *   {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed)
087 * }
088 * 
089 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) {
090 *   INodeInfo of root, numberOfChildren of root: int
091 *   [list of INodeInfo of root's children],
092 *   [list of INodeDirectoryInfo of root's directory children]
093 * }
094 * 
095 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported){
096 *   [list of INodeInfo of INodes in topological order]
097 * }
098 * 
099 * INodeInfo {
100 *   {
101 *     localName: short + byte[]
102 *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported
103 *   or 
104 *   {
105 *     fullPath: byte[]
106 *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported
107 *   replicationFactor: short, modificationTime: long,
108 *   accessTime: long, preferredBlockSize: long,
109 *   numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymLink),
110 *   { 
111 *     nsQuota: long, dsQuota: long, 
112 *     {
113 *       isINodeSnapshottable: byte,
114 *       isINodeWithSnapshot: byte (if isINodeSnapshottable is false)
115 *     } (when {@link Feature#SNAPSHOT} is supported), 
116 *     fsPermission: short, PermissionStatus
117 *   } for INodeDirectory
118 *   or 
119 *   {
120 *     symlinkString, fsPermission: short, PermissionStatus
121 *   } for INodeSymlink
122 *   or
123 *   {
124 *     [list of BlockInfo]
125 *     [list of FileDiff]
126 *     {
127 *       isINodeFileUnderConstructionSnapshot: byte, 
128 *       {clientName: short + byte[], clientMachine: short + byte[]} (when 
129 *       isINodeFileUnderConstructionSnapshot is true),
130 *     } (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode), 
131 *     fsPermission: short, PermissionStatus
132 *   } for INodeFile
133 * }
134 * 
135 * INodeDirectoryInfo {
136 *   fullPath of the directory: short + byte[],
137 *   numberOfChildren: int, [list of INodeInfo of children INode],
138 *   {
139 *     numberOfSnapshots: int,
140 *     [list of Snapshot] (when NumberOfSnapshots is positive),
141 *     numberOfDirectoryDiffs: int,
142 *     [list of DirectoryDiff] (NumberOfDirectoryDiffs is positive),
143 *     number of children that are directories,
144 *     [list of INodeDirectoryInfo of the directory children] (includes
145 *     snapshot copies of deleted sub-directories)
146 *   } (when {@link Feature#SNAPSHOT} is supported), 
147 * }
148 * 
149 * Snapshot {
150 *   snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is 
151 *   the name of the snapshot)
152 * }
153 * 
154 * DirectoryDiff {
155 *   full path of the root of the associated Snapshot: short + byte[], 
156 *   childrenSize: int, 
157 *   isSnapshotRoot: byte, 
158 *   snapshotINodeIsNotNull: byte (when isSnapshotRoot is false),
159 *   snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff 
160 * }
161 * 
162 * Diff {
163 *   createdListSize: int, [Local name of INode in created list],
164 *   deletedListSize: int, [INode in deleted list: INodeInfo]
165 * }
166 *
167 * FileDiff {
168 *   full path of the root of the associated Snapshot: short + byte[], 
169 *   fileSize: long, 
170 *   snapshotINodeIsNotNull: byte,
171 *   snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff 
172 * }
173 * </pre>
174 */
175@InterfaceAudience.Private
176@InterfaceStability.Evolving
177public class FSImageFormat {
  /** Logger of this class; it is the same instance as {@code FSImage.LOG}. */
  private static final Log LOG = FSImage.LOG;
  
  /** Static-only class: the private constructor prevents instantiation. */
  private FSImageFormat() {}
182  
183  /**
184   * A one-shot class responsible for loading an image. The load() function
185   * should be called once, after which the getter methods may be used to retrieve
186   * information about the image that was loaded, if loading was successful.
187   */
188  public static class Loader {
    /** Configuration passed on when the image's compression header is read. */
    private final Configuration conf;
    /** which namesystem this loader is working for */
    private final FSNamesystem namesystem;

    /** Set to true once a file has been loaded using this loader. */
    private boolean loaded = false;

    /** The transaction ID of the last edit represented by the loaded file */
    private long imgTxId;
    /** The MD5 sum of the loaded file */
    private MD5Hash imgDigest;
    
    /**
     * Value returned by the snapshot manager's read of the image; it is only
     * assigned by load() when the image layout supports snapshots and stays
     * null otherwise.
     */
    private Map<Integer, Snapshot> snapshotMap = null;
    /**
     * Bookkeeping for reference inodes; consulted while loading directories
     * (to decide whether a subtree still has to be processed) and while
     * loading reference inodes.
     */
    private final ReferenceMap referenceMap = new ReferenceMap();
203
204    Loader(Configuration conf, FSNamesystem namesystem) {
205      this.conf = conf;
206      this.namesystem = namesystem;
207    }
208
209    /**
210     * Return the MD5 checksum of the image that has been loaded.
211     * @throws IllegalStateException if load() has not yet been called.
212     */
213    MD5Hash getLoadedImageMd5() {
214      checkLoaded();
215      return imgDigest;
216    }
217
218    long getLoadedImageTxId() {
219      checkLoaded();
220      return imgTxId;
221    }
222
223    /**
224     * Throw IllegalStateException if load() has not yet been called.
225     */
226    private void checkLoaded() {
227      if (!loaded) {
228        throw new IllegalStateException("Image not yet loaded!");
229      }
230    }
231
232    /**
233     * Throw IllegalStateException if load() has already been called.
234     */
235    private void checkNotLoaded() {
236      if (loaded) {
237        throw new IllegalStateException("Image already loaded!");
238      }
239    }
240
    /**
     * Load the fsimage stored in the given file into the namesystem this
     * loader works for.
     * <p>
     * The header fields are read first (layout version, layout flags,
     * namespace ID, number of files, generation stamps, transaction ID, last
     * inode ID, snapshot information and compression header; most of them only
     * when the image's layout version supports the corresponding feature).
     * Then the inodes, the files under construction, the secret manager state
     * and the cache manager state are loaded, in that order. The file is read
     * through a digest stream, and on success the resulting MD5 and the image
     * transaction ID become available through the getters.
     *
     * @param curFile the image file to load; must not be null (only checked
     *                by an assertion)
     * @throws IllegalStateException if this loader has already loaded an image
     * @throws InconsistentFSStateException if the layout version stored in the
     *         file differs from the value returned by getLayoutVersion()
     * @throws IOException if the file cannot be read
     */
    void load(File curFile) throws IOException {
      checkNotLoaded();
      assert curFile != null : "curFile is null";

      // report progress of this load under the LOADING_FSIMAGE phase
      StartupProgress prog = NameNode.getStartupProgress();
      Step step = new Step(StepType.INODES);
      prog.beginStep(Phase.LOADING_FSIMAGE, step);
      long startTime = now();

      //
      // Load in bits
      //
      // every byte read from the file also passes through the MD5 digester
      MessageDigest digester = MD5Hash.getDigester();
      DigestInputStream fin = new DigestInputStream(
           new FileInputStream(curFile), digester);

      DataInputStream in = new DataInputStream(fin);
      try {
        // read image version: first appeared in version -1
        int imgVersion = in.readInt();
        if (getLayoutVersion() != imgVersion) {
          throw new InconsistentFSStateException(curFile, 
              "imgVersion " + imgVersion +
              " expected to be " + getLayoutVersion());
        }
        boolean supportSnapshot = LayoutVersion.supports(Feature.SNAPSHOT,
            imgVersion);
        if (LayoutVersion.supports(Feature.ADD_LAYOUT_FLAGS, imgVersion)) {
          // the value returned by LayoutFlags.read is not used by this method
          LayoutFlags.read(in);
        }

        // read namespaceID: first appeared in version -2
        // (the value is discarded; the read only advances the stream)
        in.readInt();

        long numFiles = in.readLong();

        // read in the last generation stamp for legacy blocks.
        long genstamp = in.readLong();
        namesystem.setGenerationStampV1(genstamp);
        
        if (LayoutVersion.supports(Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
          // read the starting generation stamp for sequential block IDs
          genstamp = in.readLong();
          namesystem.setGenerationStampV2(genstamp);

          // read the last generation stamp for blocks created after
          // the switch to sequential block IDs.
          long stampAtIdSwitch = in.readLong();
          namesystem.setGenerationStampV1Limit(stampAtIdSwitch);

          // read the max sequential block ID.
          long maxSequentialBlockId = in.readLong();
          namesystem.setLastAllocatedBlockId(maxSequentialBlockId);
        } else {
          // the image predates sequential block IDs: nothing more to read,
          // the namesystem picks the starting V2 generation stamp itself
          long startingGenStamp = namesystem.upgradeGenerationStampToV2();
          // This is an upgrade.
          LOG.info("Upgrading to sequential block IDs. Generation stamp " +
                   "for new blocks set to " + startingGenStamp);
        }

        // read the transaction ID of the last edit represented by
        // this image
        if (LayoutVersion.supports(Feature.STORED_TXIDS, imgVersion)) {
          imgTxId = in.readLong();
        } else {
          imgTxId = 0;
        }

        // read the last allocated inode id in the fsimage
        if (LayoutVersion.supports(Feature.ADD_INODE_ID, imgVersion)) {
          long lastInodeId = in.readLong();
          namesystem.resetLastInodeId(lastInodeId);
          if (LOG.isDebugEnabled()) {
            LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
          }
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Old layout version doesn't have inode id."
                + " Will assign new id for each inode.");
          }
        }
        
        // the snapshot section of the header is read by the snapshot manager
        if (supportSnapshot) {
          snapshotMap = namesystem.getSnapshotManager().read(in, this);
        }

        // read compression related info
        FSImageCompression compression;
        if (LayoutVersion.supports(Feature.FSIMAGE_COMPRESSION, imgVersion)) {
          compression = FSImageCompression.readCompressionHeader(conf, in);
        } else {
          compression = FSImageCompression.createNoopCompression();
        }
        // from here on read through the stream produced by the compression,
        // which is layered on the same digest stream
        in = compression.unwrapInputStream(fin);

        LOG.info("Loading image file " + curFile + " using " + compression);
        
        // load all inodes
        LOG.info("Number of files = " + numFiles);
        prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
        Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
        // pick the inode loader that matches the image layout
        if (LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
            imgVersion)) {
          if (supportSnapshot) {
            loadLocalNameINodesWithSnapshot(numFiles, in, counter);
          } else {
            loadLocalNameINodes(numFiles, in, counter);
          }
        } else {
          loadFullNameINodes(numFiles, in, counter);
        }

        loadFilesUnderConstruction(in, supportSnapshot, counter);
        prog.endStep(Phase.LOADING_FSIMAGE, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);

        loadSecretManagerState(in);

        loadCacheManagerState(in);

        // make sure to read to the end of file
        // (only enforced when assertions are enabled)
        boolean eof = (in.read() == -1);
        assert eof : "Should have reached the end of image file " + curFile;
      } finally {
        in.close();
      }

      // reached only when loading succeeded: publish digest and loaded flag
      imgDigest = new MD5Hash(digester.digest());
      loaded = true;
      
      LOG.info("Image file " + curFile + " of size " + curFile.length() +
          " bytes loaded in " + (now() - startTime)/1000 + " seconds.");
    }
376
377  /** Update the root node's attributes */
378  private void updateRootAttr(INodeWithAdditionalFields root) {                                                           
379    long nsQuota = root.getNsQuota();
380    long dsQuota = root.getDsQuota();
381    FSDirectory fsDir = namesystem.dir;
382    if (nsQuota != -1 || dsQuota != -1) {
383      fsDir.rootDir.setQuota(nsQuota, dsQuota);
384    }
385    fsDir.rootDir.cloneModificationTime(root);
386    fsDir.rootDir.clonePermissionStatus(root);    
387  }
388  
389    /**
390     * Load fsimage files when 1) only local names are stored, 
391     * and 2) snapshot is supported.
392     * 
393     * @param numFiles number of files expected to be read
394     * @param in Image input stream
395     * @param counter Counter to increment for namenode startup progress
396     */
397    private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
398        Counter counter) throws IOException {
399      assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
400          getLayoutVersion());
401      assert LayoutVersion.supports(Feature.SNAPSHOT, getLayoutVersion());
402      
403      // load root
404      loadRoot(in, counter);
405      // load rest of the nodes recursively
406      loadDirectoryWithSnapshot(in, counter);
407    }
408    
409  /** 
410   * load fsimage files assuming only local names are stored
411   *   
412   * @param numFiles number of files expected to be read
413   * @param in image input stream
414   * @param counter Counter to increment for namenode startup progress
415   * @throws IOException
416   */  
417   private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
418       throws IOException {
419     assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
420         getLayoutVersion());
421     assert numFiles > 0;
422
423     // load root
424     loadRoot(in, counter);
425     // have loaded the first file (the root)
426     numFiles--; 
427
428     // load rest of the nodes directory by directory
429     while (numFiles > 0) {
430       numFiles -= loadDirectory(in, counter);
431     }
432     if (numFiles != 0) {
433       throw new IOException("Read unexpect number of files: " + -numFiles);
434     }
435   }
436   
437    /**
438     * Load information about root, and use the information to update the root
439     * directory of NameSystem.
440     * @param in The {@link DataInput} instance to read.
441     * @param counter Counter to increment for namenode startup progress
442     */
443    private void loadRoot(DataInput in, Counter counter)
444        throws IOException {
445      // load root
446      if (in.readShort() != 0) {
447        throw new IOException("First node is not root");
448      }
449      final INodeDirectory root = loadINode(null, false, in, counter)
450        .asDirectory();
451      // update the root's attributes
452      updateRootAttr(root);
453    }
454   
455    /** Load children nodes for the parent directory. */
456    private int loadChildren(INodeDirectory parent, DataInput in,
457        Counter counter) throws IOException {
458      int numChildren = in.readInt();
459      for (int i = 0; i < numChildren; i++) {
460        // load single inode
461        INode newNode = loadINodeWithLocalName(false, in, true, counter);
462        addToParent(parent, newNode);
463      }
464      return numChildren;
465    }
466    
467    /**
468     * Load a directory when snapshot is supported.
469     * @param in The {@link DataInput} instance to read.
470     * @param counter Counter to increment for namenode startup progress
471     */
472    private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
473        throws IOException {
474      // Step 1. Identify the parent INode
475      long inodeId = in.readLong();
476      final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
477          .asDirectory();
478      
479      // Check if the whole subtree has been saved (for reference nodes)
480      boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
481      if (!toLoadSubtree) {
482        return;
483      }
484      
485      // Step 2. Load snapshots if parent is snapshottable
486      int numSnapshots = in.readInt();
487      if (numSnapshots >= 0) {
488        final INodeDirectorySnapshottable snapshottableParent
489            = INodeDirectorySnapshottable.valueOf(parent, parent.getLocalName());
490        // load snapshots and snapshotQuota
491        SnapshotFSImageFormat.loadSnapshotList(snapshottableParent,
492            numSnapshots, in, this);
493        if (snapshottableParent.getSnapshotQuota() > 0) {
494          // add the directory to the snapshottable directory list in 
495          // SnapshotManager. Note that we only add root when its snapshot quota
496          // is positive.
497          this.namesystem.getSnapshotManager().addSnapshottable(
498              snapshottableParent);
499        }
500      }
501
502      // Step 3. Load children nodes under parent
503      loadChildren(parent, in, counter);
504      
505      // Step 4. load Directory Diff List
506      SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);
507      
508      // Recursively load sub-directories, including snapshot copies of deleted
509      // directories
510      int numSubTree = in.readInt();
511      for (int i = 0; i < numSubTree; i++) {
512        loadDirectoryWithSnapshot(in, counter);
513      }
514    }
515    
516   /**
517    * Load all children of a directory
518    * 
519    * @param in
520    * @param counter Counter to increment for namenode startup progress
521    * @return number of child inodes read
522    * @throws IOException
523    */
524   private int loadDirectory(DataInput in, Counter counter) throws IOException {
525     String parentPath = FSImageSerialization.readString(in);
526     final INodeDirectory parent = INodeDirectory.valueOf(
527         namesystem.dir.rootDir.getNode(parentPath, true), parentPath);
528     return loadChildren(parent, in, counter);
529   }
530
531  /**
532   * load fsimage files assuming full path names are stored
533   * 
534   * @param numFiles total number of files to load
535   * @param in data input stream
536   * @param counter Counter to increment for namenode startup progress
537   * @throws IOException if any error occurs
538   */
539  private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
540      throws IOException {
541    byte[][] pathComponents;
542    byte[][] parentPath = {{}};      
543    FSDirectory fsDir = namesystem.dir;
544    INodeDirectory parentINode = fsDir.rootDir;
545    for (long i = 0; i < numFiles; i++) {
546      pathComponents = FSImageSerialization.readPathComponents(in);
547      final INode newNode = loadINode(
548          pathComponents[pathComponents.length-1], false, in, counter);
549
550      if (isRoot(pathComponents)) { // it is the root
551        // update the root's attributes
552        updateRootAttr(newNode.asDirectory());
553        continue;
554      }
555      // check if the new inode belongs to the same parent
556      if(!isParent(pathComponents, parentPath)) {
557        parentINode = getParentINodeDirectory(pathComponents);
558        parentPath = getParent(pathComponents);
559      }
560
561      // add new inode
562      addToParent(parentINode, newNode);
563    }
564  }
565
566  private INodeDirectory getParentINodeDirectory(byte[][] pathComponents
567      ) throws FileNotFoundException, PathIsNotDirectoryException,
568      UnresolvedLinkException {
569    if (pathComponents.length < 2) { // root
570      return null;
571    }
572    // Gets the parent INode
573    final INodesInPath inodes = namesystem.dir.getExistingPathINodes(
574        pathComponents);
575    return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
576  }
577
578  /**
579   * Add the child node to parent and, if child is a file, update block map.
580   * This method is only used for image loading so that synchronization,
581   * modification time update and space count update are not needed.
582   */
583  private void addToParent(INodeDirectory parent, INode child) {
584    FSDirectory fsDir = namesystem.dir;
585    if (parent == fsDir.rootDir && FSDirectory.isReservedName(child)) {
586        throw new HadoopIllegalArgumentException("File name \""
587            + child.getLocalName() + "\" is reserved. Please "
588            + " change the name of the existing file or directory to another "
589            + "name before upgrading to this release.");
590    }
591    // NOTE: This does not update space counts for parents
592    if (!parent.addChild(child)) {
593      return;
594    }
595    namesystem.dir.cacheName(child);
596
597    if (child.isFile()) {
598      updateBlocksMap(child.asFile());
599    }
600  }
601
602    public void updateBlocksMap(INodeFile file) {
603      // Add file->block mapping
604      final BlockInfo[] blocks = file.getBlocks();
605      if (blocks != null) {
606        final BlockManager bm = namesystem.getBlockManager();
607        for (int i = 0; i < blocks.length; i++) {
608          file.setBlock(i, bm.addBlockCollection(blocks[i], file));
609        } 
610      }
611    }
612
613    /** @return The FSDirectory of the namesystem where the fsimage is loaded */
614    public FSDirectory getFSDirectoryInLoading() {
615      return namesystem.dir;
616    }
617
618    public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
619        boolean updateINodeMap) throws IOException {
620      return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
621    }
622
623    public INode loadINodeWithLocalName(boolean isSnapshotINode,
624        DataInput in, boolean updateINodeMap, Counter counter)
625        throws IOException {
626      final byte[] localName = FSImageSerialization.readLocalName(in);
627      INode inode = loadINode(localName, isSnapshotINode, in, counter);
628      if (updateINodeMap
629          && LayoutVersion.supports(Feature.ADD_INODE_ID, getLayoutVersion())) {
630        namesystem.dir.addToInodeMap(inode);
631      }
632      return inode;
633    }
634  
635  /**
636   * load an inode from fsimage except for its name
637   * 
638   * @param in data input stream from which image is read
639   * @param counter Counter to increment for namenode startup progress
640   * @return an inode
641   */
642  @SuppressWarnings("deprecation")
643  INode loadINode(final byte[] localName, boolean isSnapshotINode,
644      DataInput in, Counter counter) throws IOException {
645    final int imgVersion = getLayoutVersion();
646    if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
647      namesystem.getFSDirectory().verifyINodeName(localName);
648    }
649
650    long inodeId = LayoutVersion.supports(Feature.ADD_INODE_ID, imgVersion) ? 
651           in.readLong() : namesystem.allocateNewInodeId();
652    
653    final short replication = namesystem.getBlockManager().adjustReplication(
654        in.readShort());
655    final long modificationTime = in.readLong();
656    long atime = 0;
657    if (LayoutVersion.supports(Feature.FILE_ACCESS_TIME, imgVersion)) {
658      atime = in.readLong();
659    }
660    final long blockSize = in.readLong();
661    final int numBlocks = in.readInt();
662
663    if (numBlocks >= 0) {
664      // file
665      
666      // read blocks
667      BlockInfo[] blocks = null;
668      if (numBlocks >= 0) {
669        blocks = new BlockInfo[numBlocks];
670        for (int j = 0; j < numBlocks; j++) {
671          blocks[j] = new BlockInfo(replication);
672          blocks[j].readFields(in);
673        }
674      }
675
676      String clientName = "";
677      String clientMachine = "";
678      boolean underConstruction = false;
679      FileDiffList fileDiffs = null;
680      if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
681        // read diffs
682        fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);
683
684        if (isSnapshotINode) {
685          underConstruction = in.readBoolean();
686          if (underConstruction) {
687            clientName = FSImageSerialization.readString(in);
688            clientMachine = FSImageSerialization.readString(in);
689            // convert the last block to BlockUC
690            if (blocks != null && blocks.length > 0) {
691              BlockInfo lastBlk = blocks[blocks.length - 1]; 
692              blocks[blocks.length - 1] = new BlockInfoUnderConstruction(
693                  lastBlk, replication);
694            }
695          }
696        }
697      }
698
699      final PermissionStatus permissions = PermissionStatus.read(in);
700
701      // return
702      if (counter != null) {
703        counter.increment();
704      }
705      final INodeFile file = new INodeFile(inodeId, localName, permissions,
706          modificationTime, atime, blocks, replication, blockSize);
707      if (underConstruction) {
708        INodeFileUnderConstruction fileUC = new INodeFileUnderConstruction(
709            file, clientName, clientMachine, null);
710        return fileDiffs == null ? fileUC :
711          new INodeFileUnderConstructionWithSnapshot(fileUC, fileDiffs);
712      } else {
713        return fileDiffs == null ? file : 
714          new INodeFileWithSnapshot(file, fileDiffs);
715      }
716    } else if (numBlocks == -1) {
717      //directory
718      
719      //read quotas
720      final long nsQuota = in.readLong();
721      long dsQuota = -1L;
722      if (LayoutVersion.supports(Feature.DISKSPACE_QUOTA, imgVersion)) {
723        dsQuota = in.readLong();
724      }
725
726      //read snapshot info
727      boolean snapshottable = false;
728      boolean withSnapshot = false;
729      if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
730        snapshottable = in.readBoolean();
731        if (!snapshottable) {
732          withSnapshot = in.readBoolean();
733        }
734      }
735
736      final PermissionStatus permissions = PermissionStatus.read(in);
737
738      //return
739      if (counter != null) {
740        counter.increment();
741      }
742      final INodeDirectory dir = nsQuota >= 0 || dsQuota >= 0?
743          new INodeDirectoryWithQuota(inodeId, localName, permissions,
744              modificationTime, nsQuota, dsQuota)
745          : new INodeDirectory(inodeId, localName, permissions, modificationTime);
746      return snapshottable ? new INodeDirectorySnapshottable(dir)
747          : withSnapshot ? new INodeDirectoryWithSnapshot(dir)
748          : dir;
749    } else if (numBlocks == -2) {
750      //symlink
751      if (!FileSystem.areSymlinksEnabled()) {
752        throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
753      }
754
755      final String symlink = Text.readString(in);
756      final PermissionStatus permissions = PermissionStatus.read(in);
757      if (counter != null) {
758        counter.increment();
759      }
760      return new INodeSymlink(inodeId, localName, permissions,
761          modificationTime, atime, symlink);
762    } else if (numBlocks == -3) {
763      //reference
764      // Intentionally do not increment counter, because it is too difficult at
765      // this point to assess whether or not this is a reference that counts
766      // toward quota.
767      
768      final boolean isWithName = in.readBoolean();
769      // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
770      int snapshotId = in.readInt();
771      
772      final INodeReference.WithCount withCount
773          = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);
774
775      if (isWithName) {
776          return new INodeReference.WithName(null, withCount, localName,
777              snapshotId);
778      } else {
779        final INodeReference ref = new INodeReference.DstReference(null,
780            withCount, snapshotId);
781        return ref;
782      }
783    }
784    
785    throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
786  }
787
788    /** Load {@link INodeFileAttributes}. */
789    public INodeFileAttributes loadINodeFileAttributes(DataInput in)
790        throws IOException {
791      final int layoutVersion = getLayoutVersion();
792      
793      if (!LayoutVersion.supports(Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
794        return loadINodeWithLocalName(true, in, false).asFile();
795      }
796  
797      final byte[] name = FSImageSerialization.readLocalName(in);
798      final PermissionStatus permissions = PermissionStatus.read(in);
799      final long modificationTime = in.readLong();
800      final long accessTime = in.readLong();
801  
802      final short replication = namesystem.getBlockManager().adjustReplication(
803          in.readShort());
804      final long preferredBlockSize = in.readLong();
805      
806      return new INodeFileAttributes.SnapshotCopy(name, permissions, modificationTime,
807          accessTime, replication, preferredBlockSize);
808    }
809
810    public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
811        throws IOException {
812      final int layoutVersion = getLayoutVersion();
813      
814      if (!LayoutVersion.supports(Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
815        return loadINodeWithLocalName(true, in, false).asDirectory();
816      }
817  
818      final byte[] name = FSImageSerialization.readLocalName(in);
819      final PermissionStatus permissions = PermissionStatus.read(in);
820      final long modificationTime = in.readLong();
821      
822      //read quotas
823      final long nsQuota = in.readLong();
824      final long dsQuota = in.readLong();
825  
826      return nsQuota == -1L && dsQuota == -1L?
827          new INodeDirectoryAttributes.SnapshotCopy(name, permissions, modificationTime)
828        : new INodeDirectoryAttributes.CopyWithQuota(name, permissions,
829            modificationTime, nsQuota, dsQuota);
830    }
831  
    /**
     * Load the files-under-construction section of the image and put each
     * entry in place of the matching file inode that was loaded earlier.
     *
     * The section starts with an int entry count, followed by that many
     * records, each read with
     * {@code FSImageSerialization.readINodeUnderConstruction}.
     *
     * @param in the image stream, positioned at the start of the section
     * @param supportSnapshot NOTE(review): not read anywhere in this method
     * @param counter incremented once per record; it is dereferenced without
     *                a null check, so callers must pass a non-null counter
     * @throws IOException if reading from the stream fails, or as propagated
     *                     by the helpers invoked here
     */
    private void loadFilesUnderConstruction(DataInput in,
        boolean supportSnapshot, Counter counter) throws IOException {
      FSDirectory fsDir = namesystem.dir;
      int size = in.readInt();

      LOG.info("Number of files under construction = " + size);

      for (int i = 0; i < size; i++) {
        INodeFileUnderConstruction cons = FSImageSerialization
            .readINodeUnderConstruction(in, namesystem, getLayoutVersion());
        counter.increment();

        // verify that file exists in namespace
        // (the local name of the record just read is used as its path)
        String path = cons.getLocalName();
        INodeFile oldnode = null;
        boolean inSnapshot = false;
        if (path != null && FSDirectory.isReservedName(path) && 
            LayoutVersion.supports(Feature.ADD_INODE_ID, getLayoutVersion())) {
          // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in 
          // snapshot. If we support INode ID in the layout version, we can use
          // the inode id to find the oldnode.
          oldnode = namesystem.dir.getInode(cons.getId()).asFile();
          inSnapshot = true;
        } else {
          // otherwise resolve the existing inode by path
          final INodesInPath iip = fsDir.getLastINodeInPath(path);
          oldnode = INodeFile.valueOf(iip.getINode(0), path);
        }
        
        // take over the name and the parent (or parent reference) of the
        // inode that is about to be replaced
        cons.setLocalName(oldnode.getLocalNameBytes());
        INodeReference parentRef = oldnode.getParentReference();
        if (parentRef != null) {
          cons.setParentReference(parentRef);
        } else {
          cons.setParent(oldnode.getParent());
        }

        // carry the diffs of the old inode over to the replacement
        if (oldnode instanceof INodeFileWithSnapshot) {
          cons = new INodeFileUnderConstructionWithSnapshot(cons,
              ((INodeFileWithSnapshot) oldnode).getDiffs());
        }

        if (!inSnapshot) {
          // found by path: replace it in the directory and register a lease
          // for the client recorded in the entry
          fsDir.replaceINodeFile(path, oldnode, cons);
          namesystem.leaseManager.addLease(cons.getClientName(), path);
        } else {
          // found by inode id via a reserved path: no lease is added here
          if (parentRef != null) {
            // replace oldnode with cons
            parentRef.setReferredINode(cons);
          } else {
            // replace old node in its parent's children list and deleted list
            oldnode.getParent().replaceChildFileInSnapshot(oldnode, cons);
            namesystem.dir.addToInodeMap(cons);
            updateBlocksMap(cons);
          }
        }
      }
    }
889
890    private void loadSecretManagerState(DataInput in)
891        throws IOException {
892      int imgVersion = getLayoutVersion();
893
894      if (!LayoutVersion.supports(Feature.DELEGATION_TOKEN, imgVersion)) {
895        //SecretManagerState is not available.
896        //This must not happen if security is turned on.
897        return; 
898      }
899      namesystem.loadSecretManagerState(in);
900    }
901
902    private void loadCacheManagerState(DataInput in) throws IOException {
903      int imgVersion = getLayoutVersion();
904      if (!LayoutVersion.supports(Feature.CACHING, imgVersion)) {
905        return;
906      }
907      namesystem.getCacheManager().loadState(in);
908    }
909
    /**
     * @return the layout version reported by the storage of the namesystem's
     *         FSImage
     */
    private int getLayoutVersion() {
      return namesystem.getFSImage().getStorage().getLayoutVersion();
    }
913
914    private boolean isRoot(byte[][] path) {
915      return path.length == 1 &&
916        path[0] == null;    
917    }
918
919    private boolean isParent(byte[][] path, byte[][] parent) {
920      if (path == null || parent == null)
921        return false;
922      if (parent.length == 0 || path.length != parent.length + 1)
923        return false;
924      boolean isParent = true;
925      for (int i = 0; i < parent.length; i++) {
926        isParent = isParent && Arrays.equals(path[i], parent[i]); 
927      }
928      return isParent;
929    }
930
931    /**
932     * Return string representing the parent of the given path.
933     */
934    String getParent(String path) {
935      return path.substring(0, path.lastIndexOf(Path.SEPARATOR));
936    }
937    
938    byte[][] getParent(byte[][] path) {
939      byte[][] result = new byte[path.length - 1][];
940      for (int i = 0; i < result.length; i++) {
941        result[i] = new byte[path[i].length];
942        System.arraycopy(path[i], 0, result[i], 0, path[i].length);
943      }
944      return result;
945    }
946    
947    public Snapshot getSnapshot(DataInput in) throws IOException {
948      return snapshotMap.get(in.readInt());
949    }
950  }
951  
952  /**
953   * A one-shot class responsible for writing an image file.
954   * The write() function should be called once, after which the getter
955   * functions may be used to retrieve information about the file that was written.
956   */
957  static class Saver {
958    private final SaveNamespaceContext context;
959    /** Set to true once an image has been written */
960    private boolean saved = false;
961    
962    /** The MD5 checksum of the file that was written */
963    private MD5Hash savedDigest;
964    private final ReferenceMap referenceMap = new ReferenceMap();
965    
966    private final Map<Long, INodeFileUnderConstruction> snapshotUCMap = 
967        new HashMap<Long, INodeFileUnderConstruction>();
968
969    /** @throws IllegalStateException if the instance has not yet saved an image */
970    private void checkSaved() {
971      if (!saved) {
972        throw new IllegalStateException("FSImageSaver has not saved an image");
973      }
974    }
975    
976    /** @throws IllegalStateException if the instance has already saved an image */
977    private void checkNotSaved() {
978      if (saved) {
979        throw new IllegalStateException("FSImageSaver has already saved an image");
980      }
981    }
982    
983
984    Saver(SaveNamespaceContext context) {
985      this.context = context;
986    }
987
988    /**
989     * Return the MD5 checksum of the image file that was saved.
990     */
991    MD5Hash getSavedDigest() {
992      checkSaved();
993      return savedDigest;
994    }
995
996    void save(File newFile, FSImageCompression compression) throws IOException {
997      checkNotSaved();
998
999      final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
1000      FSDirectory fsDir = sourceNamesystem.dir;
1001      String sdPath = newFile.getParentFile().getParentFile().getAbsolutePath();
1002      Step step = new Step(StepType.INODES, sdPath);
1003      StartupProgress prog = NameNode.getStartupProgress();
1004      prog.beginStep(Phase.SAVING_CHECKPOINT, step);
1005      prog.setTotal(Phase.SAVING_CHECKPOINT, step,
1006        fsDir.rootDir.numItemsInTree());
1007      Counter counter = prog.getCounter(Phase.SAVING_CHECKPOINT, step);
1008      long startTime = now();
1009      //
1010      // Write out data
1011      //
1012      MessageDigest digester = MD5Hash.getDigester();
1013      FileOutputStream fout = new FileOutputStream(newFile);
1014      DigestOutputStream fos = new DigestOutputStream(fout, digester);
1015      DataOutputStream out = new DataOutputStream(fos);
1016      try {
1017        out.writeInt(HdfsConstants.LAYOUT_VERSION);
1018        LayoutFlags.write(out);
1019        // We use the non-locked version of getNamespaceInfo here since
1020        // the coordinating thread of saveNamespace already has read-locked
1021        // the namespace for us. If we attempt to take another readlock
1022        // from the actual saver thread, there's a potential of a
1023        // fairness-related deadlock. See the comments on HDFS-2223.
1024        out.writeInt(sourceNamesystem.unprotectedGetNamespaceInfo()
1025            .getNamespaceID());
1026        out.writeLong(fsDir.rootDir.numItemsInTree());
1027        out.writeLong(sourceNamesystem.getGenerationStampV1());
1028        out.writeLong(sourceNamesystem.getGenerationStampV2());
1029        out.writeLong(sourceNamesystem.getGenerationStampAtblockIdSwitch());
1030        out.writeLong(sourceNamesystem.getLastAllocatedBlockId());
1031        out.writeLong(context.getTxId());
1032        out.writeLong(sourceNamesystem.getLastInodeId());
1033
1034        
1035        sourceNamesystem.getSnapshotManager().write(out);
1036        
1037        // write compression info and set up compressed stream
1038        out = compression.writeHeaderAndWrapStream(fos);
1039        LOG.info("Saving image file " + newFile +
1040                 " using " + compression);
1041
1042        // save the root
1043        saveINode2Image(fsDir.rootDir, out, false, referenceMap, counter);
1044        // save the rest of the nodes
1045        saveImage(fsDir.rootDir, out, true, false, counter);
1046        prog.endStep(Phase.SAVING_CHECKPOINT, step);
1047        // Now that the step is finished, set counter equal to total to adjust
1048        // for possible under-counting due to reference inodes.
1049        prog.setCount(Phase.SAVING_CHECKPOINT, step,
1050          fsDir.rootDir.numItemsInTree());
1051        // save files under construction
1052        // TODO: for HDFS-5428, since we cannot break the compatibility of 
1053        // fsimage, we store part of the under-construction files that are only
1054        // in snapshots in this "under-construction-file" section. As a 
1055        // temporary solution, we use "/.reserved/.inodes/<inodeid>" as their 
1056        // paths, so that when loading fsimage we do not put them into the lease
1057        // map. In the future, we can remove this hack when we can bump the 
1058        // layout version.
1059        sourceNamesystem.saveFilesUnderConstruction(out, snapshotUCMap);
1060        
1061        context.checkCancelled();
1062        sourceNamesystem.saveSecretManagerState(out, sdPath);
1063        context.checkCancelled();
1064        sourceNamesystem.getCacheManager().saveState(out, sdPath);
1065        context.checkCancelled();
1066        out.flush();
1067        context.checkCancelled();
1068        fout.getChannel().force(true);
1069      } finally {
1070        out.close();
1071      }
1072
1073      saved = true;
1074      // set md5 of the saved image
1075      savedDigest = new MD5Hash(digester.digest());
1076
1077      LOG.info("Image file " + newFile + " of size " + newFile.length() +
1078          " bytes saved in " + (now() - startTime)/1000 + " seconds.");
1079    }
1080
1081    /**
1082     * Save children INodes.
1083     * @param children The list of children INodes
1084     * @param out The DataOutputStream to write
1085     * @param inSnapshot Whether the parent directory or its ancestor is in 
1086     *                   the deleted list of some snapshot (caused by rename or 
1087     *                   deletion)
1088     * @param counter Counter to increment for namenode startup progress
1089     * @return Number of children that are directory
1090     */
1091    private int saveChildren(ReadOnlyList<INode> children,
1092        DataOutputStream out, boolean inSnapshot, Counter counter)
1093        throws IOException {
1094      // Write normal children INode. 
1095      out.writeInt(children.size());
1096      int dirNum = 0;
1097      int i = 0;
1098      for(INode child : children) {
1099        // print all children first
1100        // TODO: for HDFS-5428, we cannot change the format/content of fsimage
1101        // here, thus even if the parent directory is in snapshot, we still
1102        // do not handle INodeUC as those stored in deleted list
1103        saveINode2Image(child, out, false, referenceMap, counter);
1104        if (child.isDirectory()) {
1105          dirNum++;
1106        } else if (inSnapshot && child.isFile()
1107            && child.asFile().isUnderConstruction()) {
1108          this.snapshotUCMap.put(child.getId(),
1109              (INodeFileUnderConstruction) child.asFile());
1110        }
1111        if (i++ % 50 == 0) {
1112          context.checkCancelled();
1113        }
1114      }
1115      return dirNum;
1116    }
1117    
1118    /**
1119     * Save file tree image starting from the given root.
1120     * This is a recursive procedure, which first saves all children and 
1121     * snapshot diffs of a current directory and then moves inside the 
1122     * sub-directories.
1123     * 
1124     * @param current The current node
1125     * @param out The DataoutputStream to write the image
1126     * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
1127     *                      reference node, its subtree may already have been
1128     *                      saved before.
1129     * @param inSnapshot Whether the current directory is in snapshot
1130     * @param counter Counter to increment for namenode startup progress
1131     */
1132    private void saveImage(INodeDirectory current, DataOutputStream out,
1133        boolean toSaveSubtree, boolean inSnapshot, Counter counter)
1134        throws IOException {
1135      // write the inode id of the directory
1136      out.writeLong(current.getId());
1137      
1138      if (!toSaveSubtree) {
1139        return;
1140      }
1141      
1142      final ReadOnlyList<INode> children = current.getChildrenList(null);
1143      int dirNum = 0;
1144      List<INodeDirectory> snapshotDirs = null;
1145      if (current instanceof INodeDirectoryWithSnapshot) {
1146        snapshotDirs = new ArrayList<INodeDirectory>();
1147        ((INodeDirectoryWithSnapshot) current).getSnapshotDirectory(
1148            snapshotDirs);
1149        dirNum += snapshotDirs.size();
1150      }
1151      
1152      // 2. Write INodeDirectorySnapshottable#snapshotsByNames to record all
1153      // Snapshots
1154      if (current instanceof INodeDirectorySnapshottable) {
1155        INodeDirectorySnapshottable snapshottableNode = 
1156            (INodeDirectorySnapshottable) current;
1157        SnapshotFSImageFormat.saveSnapshots(snapshottableNode, out);
1158      } else {
1159        out.writeInt(-1); // # of snapshots
1160      }
1161
1162      // 3. Write children INode 
1163      dirNum += saveChildren(children, out, inSnapshot, counter);
1164      
1165      // 4. Write DirectoryDiff lists, if there is any.
1166      SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);
1167      
1168      // Write sub-tree of sub-directories, including possible snapshots of 
1169      // deleted sub-directories
1170      out.writeInt(dirNum); // the number of sub-directories
1171      for(INode child : children) {
1172        if(!child.isDirectory()) {
1173          continue;
1174        }
1175        // make sure we only save the subtree under a reference node once
1176        boolean toSave = child.isReference() ? 
1177            referenceMap.toProcessSubtree(child.getId()) : true;
1178        saveImage(child.asDirectory(), out, toSave, inSnapshot, counter);
1179      }
1180      if (snapshotDirs != null) {
1181        for (INodeDirectory subDir : snapshotDirs) {
1182          // make sure we only save the subtree under a reference node once
1183          boolean toSave = subDir.getParentReference() != null ? 
1184              referenceMap.toProcessSubtree(subDir.getId()) : true;
1185          saveImage(subDir, out, toSave, true, counter);
1186        }
1187      }
1188    }
1189
1190    /**
1191     * Saves inode and increments progress counter.
1192     * 
1193     * @param inode INode to save
1194     * @param out DataOutputStream to receive inode
1195     * @param writeUnderConstruction boolean true if this is under construction
1196     * @param referenceMap ReferenceMap containing reference inodes
1197     * @param counter Counter to increment for namenode startup progress
1198     * @throws IOException thrown if there is an I/O error
1199     */
1200    private void saveINode2Image(INode inode, DataOutputStream out,
1201        boolean writeUnderConstruction, ReferenceMap referenceMap,
1202        Counter counter) throws IOException {
1203      FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
1204        referenceMap);
1205      // Intentionally do not increment counter for reference inodes, because it
1206      // is too difficult at this point to assess whether or not this is a
1207      // reference that counts toward quota.
1208      if (!(inode instanceof INodeReference)) {
1209        counter.increment();
1210      }
1211    }
1212  }
1213}