001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.namenode; 019 020import static org.apache.hadoop.util.Time.now; 021 022import java.io.DataInput; 023import java.io.DataInputStream; 024import java.io.DataOutputStream; 025import java.io.File; 026import java.io.FileInputStream; 027import java.io.FileNotFoundException; 028import java.io.FileOutputStream; 029import java.io.IOException; 030import java.security.DigestInputStream; 031import java.security.DigestOutputStream; 032import java.security.MessageDigest; 033import java.util.ArrayList; 034import java.util.Arrays; 035import java.util.HashMap; 036import java.util.List; 037import java.util.Map; 038 039import org.apache.commons.logging.Log; 040import org.apache.hadoop.HadoopIllegalArgumentException; 041import org.apache.hadoop.classification.InterfaceAudience; 042import org.apache.hadoop.classification.InterfaceStability; 043import org.apache.hadoop.conf.Configuration; 044import org.apache.hadoop.fs.FileSystem; 045import org.apache.hadoop.fs.Path; 046import org.apache.hadoop.fs.PathIsNotDirectoryException; 047import org.apache.hadoop.fs.UnresolvedLinkException; 048import 
org.apache.hadoop.fs.permission.PermissionStatus; 049import org.apache.hadoop.hdfs.protocol.HdfsConstants; 050import org.apache.hadoop.hdfs.protocol.LayoutVersion; 051import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature; 052import org.apache.hadoop.hdfs.protocol.LayoutFlags; 053import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo; 054import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction; 055import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; 056import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException; 057import org.apache.hadoop.hdfs.server.namenode.snapshot.FileWithSnapshot.FileDiffList; 058import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable; 059import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectoryWithSnapshot; 060import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileUnderConstructionWithSnapshot; 061import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileWithSnapshot; 062import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot; 063import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat; 064import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap; 065import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase; 066import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress; 067import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter; 068import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step; 069import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType; 070import org.apache.hadoop.hdfs.util.ReadOnlyList; 071import org.apache.hadoop.io.MD5Hash; 072import org.apache.hadoop.io.Text; 073 074/** 075 * Contains inner classes for reading or writing the on-disk format for 076 * FSImages. 
077 * 078 * In particular, the format of the FSImage looks like: 079 * <pre> 080 * FSImage { 081 * layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long, 082 * namesystemGenerationStampV1: long, namesystemGenerationStampV2: long, 083 * generationStampAtBlockIdSwitch: long, lastAllocatedBlockId: long, 084 * transactionID: long, snapshotCounter: int, numberOfSnapshots: int, 085 * numOfSnapshottableDirs: int, 086 * {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed) 087 * } 088 * 089 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) { 090 * INodeInfo of root, numberOfChildren of root: int 091 * [list of INodeInfo of root's children], 092 * [list of INodeDirectoryInfo of root's directory children] 093 * } 094 * 095 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported){ 096 * [list of INodeInfo of INodes in topological order] 097 * } 098 * 099 * INodeInfo { 100 * { 101 * localName: short + byte[] 102 * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported 103 * or 104 * { 105 * fullPath: byte[] 106 * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported 107 * replicationFactor: short, modificationTime: long, 108 * accessTime: long, preferredBlockSize: long, 109 * numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymlink, -3 for INodeReference), 110 * { 111 * nsQuota: long, dsQuota: long, 112 * { 113 * isINodeSnapshottable: byte, 114 * isINodeWithSnapshot: byte (if isINodeSnapshottable is false) 115 * } (when {@link Feature#SNAPSHOT} is supported), 116 * fsPermission: short, PermissionStatus 117 * } for INodeDirectory 118 * or 119 * { 120 * symlinkString, fsPermission: short, PermissionStatus 121 * } for INodeSymlink 122 * or 123 * { 124 * [list of BlockInfo] 125 * [list of FileDiff] 126 * { 127 * isINodeFileUnderConstructionSnapshot: byte, 128 * {clientName: short + byte[], clientMachine: short + byte[]} (when 129 * isINodeFileUnderConstructionSnapshot is true), 130 * 
} (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode), 131 * fsPermission: short, PermissionStatus 132 * } for INodeFile 133 * } 134 * 135 * INodeDirectoryInfo { 136 * fullPath of the directory: short + byte[], 137 * numberOfChildren: int, [list of INodeInfo of children INode], 138 * { 139 * numberOfSnapshots: int, 140 * [list of Snapshot] (when NumberOfSnapshots is positive), 141 * numberOfDirectoryDiffs: int, 142 * [list of DirectoryDiff] (NumberOfDirectoryDiffs is positive), 143 * number of children that are directories, 144 * [list of INodeDirectoryInfo of the directory children] (includes 145 * snapshot copies of deleted sub-directories) 146 * } (when {@link Feature#SNAPSHOT} is supported), 147 * } 148 * 149 * Snapshot { 150 * snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is 151 * the name of the snapshot) 152 * } 153 * 154 * DirectoryDiff { 155 * full path of the root of the associated Snapshot: short + byte[], 156 * childrenSize: int, 157 * isSnapshotRoot: byte, 158 * snapshotINodeIsNotNull: byte (when isSnapshotRoot is false), 159 * snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff 160 * } 161 * 162 * Diff { 163 * createdListSize: int, [Local name of INode in created list], 164 * deletedListSize: int, [INode in deleted list: INodeInfo] 165 * } 166 * 167 * FileDiff { 168 * full path of the root of the associated Snapshot: short + byte[], 169 * fileSize: long, 170 * snapshotINodeIsNotNull: byte, 171 * snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff 172 * } 173 * </pre> 174 */ 175@InterfaceAudience.Private 176@InterfaceStability.Evolving 177public class FSImageFormat { 178 private static final Log LOG = FSImage.LOG; 179 180 // Static-only class 181 private FSImageFormat() {} 182 183 /** 184 * A one-shot class responsible for loading an image. 
The load() function 185 * should be called once, after which the getter methods may be used to retrieve 186 * information about the image that was loaded, if loading was successful. 187 */ 188 public static class Loader { 189 private final Configuration conf; 190 /** which namesystem this loader is working for */ 191 private final FSNamesystem namesystem; 192 193 /** Set to true once a file has been loaded using this loader. */ 194 private boolean loaded = false; 195 196 /** The transaction ID of the last edit represented by the loaded file */ 197 private long imgTxId; 198 /** The MD5 sum of the loaded file */ 199 private MD5Hash imgDigest; 200 201 private Map<Integer, Snapshot> snapshotMap = null; 202 private final ReferenceMap referenceMap = new ReferenceMap(); 203 204 Loader(Configuration conf, FSNamesystem namesystem) { 205 this.conf = conf; 206 this.namesystem = namesystem; 207 } 208 209 /** 210 * Return the MD5 checksum of the image that has been loaded. 211 * @throws IllegalStateException if load() has not yet been called. 212 */ 213 MD5Hash getLoadedImageMd5() { 214 checkLoaded(); 215 return imgDigest; 216 } 217 218 long getLoadedImageTxId() { 219 checkLoaded(); 220 return imgTxId; 221 } 222 223 /** 224 * Throw IllegalStateException if load() has not yet been called. 225 */ 226 private void checkLoaded() { 227 if (!loaded) { 228 throw new IllegalStateException("Image not yet loaded!"); 229 } 230 } 231 232 /** 233 * Throw IllegalStateException if load() has already been called. 
234 */ 235 private void checkNotLoaded() { 236 if (loaded) { 237 throw new IllegalStateException("Image already loaded!"); 238 } 239 } 240 241 void load(File curFile) throws IOException { 242 checkNotLoaded(); 243 assert curFile != null : "curFile is null"; 244 245 StartupProgress prog = NameNode.getStartupProgress(); 246 Step step = new Step(StepType.INODES); 247 prog.beginStep(Phase.LOADING_FSIMAGE, step); 248 long startTime = now(); 249 250 // 251 // Load in bits 252 // 253 MessageDigest digester = MD5Hash.getDigester(); 254 DigestInputStream fin = new DigestInputStream( 255 new FileInputStream(curFile), digester); 256 257 DataInputStream in = new DataInputStream(fin); 258 try { 259 // read image version: first appeared in version -1 260 int imgVersion = in.readInt(); 261 if (getLayoutVersion() != imgVersion) { 262 throw new InconsistentFSStateException(curFile, 263 "imgVersion " + imgVersion + 264 " expected to be " + getLayoutVersion()); 265 } 266 boolean supportSnapshot = LayoutVersion.supports(Feature.SNAPSHOT, 267 imgVersion); 268 if (LayoutVersion.supports(Feature.ADD_LAYOUT_FLAGS, imgVersion)) { 269 LayoutFlags.read(in); 270 } 271 272 // read namespaceID: first appeared in version -2 273 in.readInt(); 274 275 long numFiles = in.readLong(); 276 277 // read in the last generation stamp for legacy blocks. 278 long genstamp = in.readLong(); 279 namesystem.setGenerationStampV1(genstamp); 280 281 if (LayoutVersion.supports(Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) { 282 // read the starting generation stamp for sequential block IDs 283 genstamp = in.readLong(); 284 namesystem.setGenerationStampV2(genstamp); 285 286 // read the last generation stamp for blocks created after 287 // the switch to sequential block IDs. 288 long stampAtIdSwitch = in.readLong(); 289 namesystem.setGenerationStampV1Limit(stampAtIdSwitch); 290 291 // read the max sequential block ID. 
292 long maxSequentialBlockId = in.readLong(); 293 namesystem.setLastAllocatedBlockId(maxSequentialBlockId); 294 } else { 295 long startingGenStamp = namesystem.upgradeGenerationStampToV2(); 296 // This is an upgrade. 297 LOG.info("Upgrading to sequential block IDs. Generation stamp " + 298 "for new blocks set to " + startingGenStamp); 299 } 300 301 // read the transaction ID of the last edit represented by 302 // this image 303 if (LayoutVersion.supports(Feature.STORED_TXIDS, imgVersion)) { 304 imgTxId = in.readLong(); 305 } else { 306 imgTxId = 0; 307 } 308 309 // read the last allocated inode id in the fsimage 310 if (LayoutVersion.supports(Feature.ADD_INODE_ID, imgVersion)) { 311 long lastInodeId = in.readLong(); 312 namesystem.resetLastInodeId(lastInodeId); 313 if (LOG.isDebugEnabled()) { 314 LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId); 315 } 316 } else { 317 if (LOG.isDebugEnabled()) { 318 LOG.debug("Old layout version doesn't have inode id." 319 + " Will assign new id for each inode."); 320 } 321 } 322 323 if (supportSnapshot) { 324 snapshotMap = namesystem.getSnapshotManager().read(in, this); 325 } 326 327 // read compression related info 328 FSImageCompression compression; 329 if (LayoutVersion.supports(Feature.FSIMAGE_COMPRESSION, imgVersion)) { 330 compression = FSImageCompression.readCompressionHeader(conf, in); 331 } else { 332 compression = FSImageCompression.createNoopCompression(); 333 } 334 in = compression.unwrapInputStream(fin); 335 336 LOG.info("Loading image file " + curFile + " using " + compression); 337 338 // load all inodes 339 LOG.info("Number of files = " + numFiles); 340 prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles); 341 Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step); 342 if (LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION, 343 imgVersion)) { 344 if (supportSnapshot) { 345 loadLocalNameINodesWithSnapshot(numFiles, in, counter); 346 } else { 347 loadLocalNameINodes(numFiles, in, 
counter); 348 } 349 } else { 350 loadFullNameINodes(numFiles, in, counter); 351 } 352 353 loadFilesUnderConstruction(in, supportSnapshot, counter); 354 prog.endStep(Phase.LOADING_FSIMAGE, step); 355 // Now that the step is finished, set counter equal to total to adjust 356 // for possible under-counting due to reference inodes. 357 prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles); 358 359 loadSecretManagerState(in); 360 361 loadCacheManagerState(in); 362 363 // make sure to read to the end of file 364 boolean eof = (in.read() == -1); 365 assert eof : "Should have reached the end of image file " + curFile; 366 } finally { 367 in.close(); 368 } 369 370 imgDigest = new MD5Hash(digester.digest()); 371 loaded = true; 372 373 LOG.info("Image file " + curFile + " of size " + curFile.length() + 374 " bytes loaded in " + (now() - startTime)/1000 + " seconds."); 375 } 376 377 /** Update the root node's attributes */ 378 private void updateRootAttr(INodeWithAdditionalFields root) { 379 long nsQuota = root.getNsQuota(); 380 long dsQuota = root.getDsQuota(); 381 FSDirectory fsDir = namesystem.dir; 382 if (nsQuota != -1 || dsQuota != -1) { 383 fsDir.rootDir.setQuota(nsQuota, dsQuota); 384 } 385 fsDir.rootDir.cloneModificationTime(root); 386 fsDir.rootDir.clonePermissionStatus(root); 387 } 388 389 /** 390 * Load fsimage files when 1) only local names are stored, 391 * and 2) snapshot is supported. 
392 * 393 * @param numFiles number of files expected to be read 394 * @param in Image input stream 395 * @param counter Counter to increment for namenode startup progress 396 */ 397 private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in, 398 Counter counter) throws IOException { 399 assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION, 400 getLayoutVersion()); 401 assert LayoutVersion.supports(Feature.SNAPSHOT, getLayoutVersion()); 402 403 // load root 404 loadRoot(in, counter); 405 // load rest of the nodes recursively 406 loadDirectoryWithSnapshot(in, counter); 407 } 408 409 /** 410 * load fsimage files assuming only local names are stored 411 * 412 * @param numFiles number of files expected to be read 413 * @param in image input stream 414 * @param counter Counter to increment for namenode startup progress 415 * @throws IOException 416 */ 417 private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter) 418 throws IOException { 419 assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION, 420 getLayoutVersion()); 421 assert numFiles > 0; 422 423 // load root 424 loadRoot(in, counter); 425 // have loaded the first file (the root) 426 numFiles--; 427 428 // load rest of the nodes directory by directory 429 while (numFiles > 0) { 430 numFiles -= loadDirectory(in, counter); 431 } 432 if (numFiles != 0) { 433 throw new IOException("Read unexpect number of files: " + -numFiles); 434 } 435 } 436 437 /** 438 * Load information about root, and use the information to update the root 439 * directory of NameSystem. 440 * @param in The {@link DataInput} instance to read. 
441 * @param counter Counter to increment for namenode startup progress 442 */ 443 private void loadRoot(DataInput in, Counter counter) 444 throws IOException { 445 // load root 446 if (in.readShort() != 0) { 447 throw new IOException("First node is not root"); 448 } 449 final INodeDirectory root = loadINode(null, false, in, counter) 450 .asDirectory(); 451 // update the root's attributes 452 updateRootAttr(root); 453 } 454 455 /** Load children nodes for the parent directory. */ 456 private int loadChildren(INodeDirectory parent, DataInput in, 457 Counter counter) throws IOException { 458 int numChildren = in.readInt(); 459 for (int i = 0; i < numChildren; i++) { 460 // load single inode 461 INode newNode = loadINodeWithLocalName(false, in, true, counter); 462 addToParent(parent, newNode); 463 } 464 return numChildren; 465 } 466 467 /** 468 * Load a directory when snapshot is supported. 469 * @param in The {@link DataInput} instance to read. 470 * @param counter Counter to increment for namenode startup progress 471 */ 472 private void loadDirectoryWithSnapshot(DataInput in, Counter counter) 473 throws IOException { 474 // Step 1. Identify the parent INode 475 long inodeId = in.readLong(); 476 final INodeDirectory parent = this.namesystem.dir.getInode(inodeId) 477 .asDirectory(); 478 479 // Check if the whole subtree has been saved (for reference nodes) 480 boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId()); 481 if (!toLoadSubtree) { 482 return; 483 } 484 485 // Step 2. 
Load snapshots if parent is snapshottable 486 int numSnapshots = in.readInt(); 487 if (numSnapshots >= 0) { 488 final INodeDirectorySnapshottable snapshottableParent 489 = INodeDirectorySnapshottable.valueOf(parent, parent.getLocalName()); 490 // load snapshots and snapshotQuota 491 SnapshotFSImageFormat.loadSnapshotList(snapshottableParent, 492 numSnapshots, in, this); 493 if (snapshottableParent.getSnapshotQuota() > 0) { 494 // add the directory to the snapshottable directory list in 495 // SnapshotManager. Note that we only add root when its snapshot quota 496 // is positive. 497 this.namesystem.getSnapshotManager().addSnapshottable( 498 snapshottableParent); 499 } 500 } 501 502 // Step 3. Load children nodes under parent 503 loadChildren(parent, in, counter); 504 505 // Step 4. load Directory Diff List 506 SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this); 507 508 // Recursively load sub-directories, including snapshot copies of deleted 509 // directories 510 int numSubTree = in.readInt(); 511 for (int i = 0; i < numSubTree; i++) { 512 loadDirectoryWithSnapshot(in, counter); 513 } 514 } 515 516 /** 517 * Load all children of a directory 518 * 519 * @param in 520 * @param counter Counter to increment for namenode startup progress 521 * @return number of child inodes read 522 * @throws IOException 523 */ 524 private int loadDirectory(DataInput in, Counter counter) throws IOException { 525 String parentPath = FSImageSerialization.readString(in); 526 final INodeDirectory parent = INodeDirectory.valueOf( 527 namesystem.dir.rootDir.getNode(parentPath, true), parentPath); 528 return loadChildren(parent, in, counter); 529 } 530 531 /** 532 * load fsimage files assuming full path names are stored 533 * 534 * @param numFiles total number of files to load 535 * @param in data input stream 536 * @param counter Counter to increment for namenode startup progress 537 * @throws IOException if any error occurs 538 */ 539 private void loadFullNameINodes(long 
numFiles, DataInput in, Counter counter) 540 throws IOException { 541 byte[][] pathComponents; 542 byte[][] parentPath = {{}}; 543 FSDirectory fsDir = namesystem.dir; 544 INodeDirectory parentINode = fsDir.rootDir; 545 for (long i = 0; i < numFiles; i++) { 546 pathComponents = FSImageSerialization.readPathComponents(in); 547 final INode newNode = loadINode( 548 pathComponents[pathComponents.length-1], false, in, counter); 549 550 if (isRoot(pathComponents)) { // it is the root 551 // update the root's attributes 552 updateRootAttr(newNode.asDirectory()); 553 continue; 554 } 555 // check if the new inode belongs to the same parent 556 if(!isParent(pathComponents, parentPath)) { 557 parentINode = getParentINodeDirectory(pathComponents); 558 parentPath = getParent(pathComponents); 559 } 560 561 // add new inode 562 addToParent(parentINode, newNode); 563 } 564 } 565 566 private INodeDirectory getParentINodeDirectory(byte[][] pathComponents 567 ) throws FileNotFoundException, PathIsNotDirectoryException, 568 UnresolvedLinkException { 569 if (pathComponents.length < 2) { // root 570 return null; 571 } 572 // Gets the parent INode 573 final INodesInPath inodes = namesystem.dir.getExistingPathINodes( 574 pathComponents); 575 return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents); 576 } 577 578 /** 579 * Add the child node to parent and, if child is a file, update block map. 580 * This method is only used for image loading so that synchronization, 581 * modification time update and space count update are not needed. 582 */ 583 private void addToParent(INodeDirectory parent, INode child) { 584 FSDirectory fsDir = namesystem.dir; 585 if (parent == fsDir.rootDir && FSDirectory.isReservedName(child)) { 586 throw new HadoopIllegalArgumentException("File name \"" 587 + child.getLocalName() + "\" is reserved. 
Please " 588 + " change the name of the existing file or directory to another " 589 + "name before upgrading to this release."); 590 } 591 // NOTE: This does not update space counts for parents 592 if (!parent.addChild(child)) { 593 return; 594 } 595 namesystem.dir.cacheName(child); 596 597 if (child.isFile()) { 598 updateBlocksMap(child.asFile()); 599 } 600 } 601 602 public void updateBlocksMap(INodeFile file) { 603 // Add file->block mapping 604 final BlockInfo[] blocks = file.getBlocks(); 605 if (blocks != null) { 606 final BlockManager bm = namesystem.getBlockManager(); 607 for (int i = 0; i < blocks.length; i++) { 608 file.setBlock(i, bm.addBlockCollection(blocks[i], file)); 609 } 610 } 611 } 612 613 /** @return The FSDirectory of the namesystem where the fsimage is loaded */ 614 public FSDirectory getFSDirectoryInLoading() { 615 return namesystem.dir; 616 } 617 618 public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in, 619 boolean updateINodeMap) throws IOException { 620 return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null); 621 } 622 623 public INode loadINodeWithLocalName(boolean isSnapshotINode, 624 DataInput in, boolean updateINodeMap, Counter counter) 625 throws IOException { 626 final byte[] localName = FSImageSerialization.readLocalName(in); 627 INode inode = loadINode(localName, isSnapshotINode, in, counter); 628 if (updateINodeMap 629 && LayoutVersion.supports(Feature.ADD_INODE_ID, getLayoutVersion())) { 630 namesystem.dir.addToInodeMap(inode); 631 } 632 return inode; 633 } 634 635 /** 636 * load an inode from fsimage except for its name 637 * 638 * @param in data input stream from which image is read 639 * @param counter Counter to increment for namenode startup progress 640 * @return an inode 641 */ 642 @SuppressWarnings("deprecation") 643 INode loadINode(final byte[] localName, boolean isSnapshotINode, 644 DataInput in, Counter counter) throws IOException { 645 final int imgVersion = getLayoutVersion(); 646 
if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) { 647 namesystem.getFSDirectory().verifyINodeName(localName); 648 } 649 650 long inodeId = LayoutVersion.supports(Feature.ADD_INODE_ID, imgVersion) ? 651 in.readLong() : namesystem.allocateNewInodeId(); 652 653 final short replication = namesystem.getBlockManager().adjustReplication( 654 in.readShort()); 655 final long modificationTime = in.readLong(); 656 long atime = 0; 657 if (LayoutVersion.supports(Feature.FILE_ACCESS_TIME, imgVersion)) { 658 atime = in.readLong(); 659 } 660 final long blockSize = in.readLong(); 661 final int numBlocks = in.readInt(); 662 663 if (numBlocks >= 0) { 664 // file 665 666 // read blocks 667 BlockInfo[] blocks = null; 668 if (numBlocks >= 0) { 669 blocks = new BlockInfo[numBlocks]; 670 for (int j = 0; j < numBlocks; j++) { 671 blocks[j] = new BlockInfo(replication); 672 blocks[j].readFields(in); 673 } 674 } 675 676 String clientName = ""; 677 String clientMachine = ""; 678 boolean underConstruction = false; 679 FileDiffList fileDiffs = null; 680 if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) { 681 // read diffs 682 fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this); 683 684 if (isSnapshotINode) { 685 underConstruction = in.readBoolean(); 686 if (underConstruction) { 687 clientName = FSImageSerialization.readString(in); 688 clientMachine = FSImageSerialization.readString(in); 689 // convert the last block to BlockUC 690 if (blocks != null && blocks.length > 0) { 691 BlockInfo lastBlk = blocks[blocks.length - 1]; 692 blocks[blocks.length - 1] = new BlockInfoUnderConstruction( 693 lastBlk, replication); 694 } 695 } 696 } 697 } 698 699 final PermissionStatus permissions = PermissionStatus.read(in); 700 701 // return 702 if (counter != null) { 703 counter.increment(); 704 } 705 final INodeFile file = new INodeFile(inodeId, localName, permissions, 706 modificationTime, atime, blocks, replication, blockSize); 707 if (underConstruction) { 708 
INodeFileUnderConstruction fileUC = new INodeFileUnderConstruction( 709 file, clientName, clientMachine, null); 710 return fileDiffs == null ? fileUC : 711 new INodeFileUnderConstructionWithSnapshot(fileUC, fileDiffs); 712 } else { 713 return fileDiffs == null ? file : 714 new INodeFileWithSnapshot(file, fileDiffs); 715 } 716 } else if (numBlocks == -1) { 717 //directory 718 719 //read quotas 720 final long nsQuota = in.readLong(); 721 long dsQuota = -1L; 722 if (LayoutVersion.supports(Feature.DISKSPACE_QUOTA, imgVersion)) { 723 dsQuota = in.readLong(); 724 } 725 726 //read snapshot info 727 boolean snapshottable = false; 728 boolean withSnapshot = false; 729 if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) { 730 snapshottable = in.readBoolean(); 731 if (!snapshottable) { 732 withSnapshot = in.readBoolean(); 733 } 734 } 735 736 final PermissionStatus permissions = PermissionStatus.read(in); 737 738 //return 739 if (counter != null) { 740 counter.increment(); 741 } 742 final INodeDirectory dir = nsQuota >= 0 || dsQuota >= 0? 743 new INodeDirectoryWithQuota(inodeId, localName, permissions, 744 modificationTime, nsQuota, dsQuota) 745 : new INodeDirectory(inodeId, localName, permissions, modificationTime); 746 return snapshottable ? new INodeDirectorySnapshottable(dir) 747 : withSnapshot ? 
new INodeDirectoryWithSnapshot(dir) 748 : dir; 749 } else if (numBlocks == -2) { 750 //symlink 751 if (!FileSystem.areSymlinksEnabled()) { 752 throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS"); 753 } 754 755 final String symlink = Text.readString(in); 756 final PermissionStatus permissions = PermissionStatus.read(in); 757 if (counter != null) { 758 counter.increment(); 759 } 760 return new INodeSymlink(inodeId, localName, permissions, 761 modificationTime, atime, symlink); 762 } else if (numBlocks == -3) { 763 //reference 764 // Intentionally do not increment counter, because it is too difficult at 765 // this point to assess whether or not this is a reference that counts 766 // toward quota. 767 768 final boolean isWithName = in.readBoolean(); 769 // lastSnapshotId for WithName node, dstSnapshotId for DstReference node 770 int snapshotId = in.readInt(); 771 772 final INodeReference.WithCount withCount 773 = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this); 774 775 if (isWithName) { 776 return new INodeReference.WithName(null, withCount, localName, 777 snapshotId); 778 } else { 779 final INodeReference ref = new INodeReference.DstReference(null, 780 withCount, snapshotId); 781 return ref; 782 } 783 } 784 785 throw new IOException("Unknown inode type: numBlocks=" + numBlocks); 786 } 787 788 /** Load {@link INodeFileAttributes}. 
*/ 789 public INodeFileAttributes loadINodeFileAttributes(DataInput in) 790 throws IOException { 791 final int layoutVersion = getLayoutVersion(); 792 793 if (!LayoutVersion.supports(Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) { 794 return loadINodeWithLocalName(true, in, false).asFile(); 795 } 796 797 final byte[] name = FSImageSerialization.readLocalName(in); 798 final PermissionStatus permissions = PermissionStatus.read(in); 799 final long modificationTime = in.readLong(); 800 final long accessTime = in.readLong(); 801 802 final short replication = namesystem.getBlockManager().adjustReplication( 803 in.readShort()); 804 final long preferredBlockSize = in.readLong(); 805 806 return new INodeFileAttributes.SnapshotCopy(name, permissions, modificationTime, 807 accessTime, replication, preferredBlockSize); 808 } 809 810 public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in) 811 throws IOException { 812 final int layoutVersion = getLayoutVersion(); 813 814 if (!LayoutVersion.supports(Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) { 815 return loadINodeWithLocalName(true, in, false).asDirectory(); 816 } 817 818 final byte[] name = FSImageSerialization.readLocalName(in); 819 final PermissionStatus permissions = PermissionStatus.read(in); 820 final long modificationTime = in.readLong(); 821 822 //read quotas 823 final long nsQuota = in.readLong(); 824 final long dsQuota = in.readLong(); 825 826 return nsQuota == -1L && dsQuota == -1L? 
827 new INodeDirectoryAttributes.SnapshotCopy(name, permissions, modificationTime) 828 : new INodeDirectoryAttributes.CopyWithQuota(name, permissions, 829 modificationTime, nsQuota, dsQuota); 830 } 831 832 private void loadFilesUnderConstruction(DataInput in, 833 boolean supportSnapshot, Counter counter) throws IOException { 834 FSDirectory fsDir = namesystem.dir; 835 int size = in.readInt(); 836 837 LOG.info("Number of files under construction = " + size); 838 839 for (int i = 0; i < size; i++) { 840 INodeFileUnderConstruction cons = FSImageSerialization 841 .readINodeUnderConstruction(in, namesystem, getLayoutVersion()); 842 counter.increment(); 843 844 // verify that file exists in namespace 845 String path = cons.getLocalName(); 846 INodeFile oldnode = null; 847 boolean inSnapshot = false; 848 if (path != null && FSDirectory.isReservedName(path) && 849 LayoutVersion.supports(Feature.ADD_INODE_ID, getLayoutVersion())) { 850 // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in 851 // snapshot. If we support INode ID in the layout version, we can use 852 // the inode id to find the oldnode. 
853 oldnode = namesystem.dir.getInode(cons.getId()).asFile(); 854 inSnapshot = true; 855 } else { 856 final INodesInPath iip = fsDir.getLastINodeInPath(path); 857 oldnode = INodeFile.valueOf(iip.getINode(0), path); 858 } 859 860 cons.setLocalName(oldnode.getLocalNameBytes()); 861 INodeReference parentRef = oldnode.getParentReference(); 862 if (parentRef != null) { 863 cons.setParentReference(parentRef); 864 } else { 865 cons.setParent(oldnode.getParent()); 866 } 867 868 if (oldnode instanceof INodeFileWithSnapshot) { 869 cons = new INodeFileUnderConstructionWithSnapshot(cons, 870 ((INodeFileWithSnapshot) oldnode).getDiffs()); 871 } 872 873 if (!inSnapshot) { 874 fsDir.replaceINodeFile(path, oldnode, cons); 875 namesystem.leaseManager.addLease(cons.getClientName(), path); 876 } else { 877 if (parentRef != null) { 878 // replace oldnode with cons 879 parentRef.setReferredINode(cons); 880 } else { 881 // replace old node in its parent's children list and deleted list 882 oldnode.getParent().replaceChildFileInSnapshot(oldnode, cons); 883 namesystem.dir.addToInodeMap(cons); 884 updateBlocksMap(cons); 885 } 886 } 887 } 888 } 889 890 private void loadSecretManagerState(DataInput in) 891 throws IOException { 892 int imgVersion = getLayoutVersion(); 893 894 if (!LayoutVersion.supports(Feature.DELEGATION_TOKEN, imgVersion)) { 895 //SecretManagerState is not available. 896 //This must not happen if security is turned on. 
897 return; 898 } 899 namesystem.loadSecretManagerState(in); 900 } 901 902 private void loadCacheManagerState(DataInput in) throws IOException { 903 int imgVersion = getLayoutVersion(); 904 if (!LayoutVersion.supports(Feature.CACHING, imgVersion)) { 905 return; 906 } 907 namesystem.getCacheManager().loadState(in); 908 } 909 910 private int getLayoutVersion() { 911 return namesystem.getFSImage().getStorage().getLayoutVersion(); 912 } 913 914 private boolean isRoot(byte[][] path) { 915 return path.length == 1 && 916 path[0] == null; 917 } 918 919 private boolean isParent(byte[][] path, byte[][] parent) { 920 if (path == null || parent == null) 921 return false; 922 if (parent.length == 0 || path.length != parent.length + 1) 923 return false; 924 boolean isParent = true; 925 for (int i = 0; i < parent.length; i++) { 926 isParent = isParent && Arrays.equals(path[i], parent[i]); 927 } 928 return isParent; 929 } 930 931 /** 932 * Return string representing the parent of the given path. 933 */ 934 String getParent(String path) { 935 return path.substring(0, path.lastIndexOf(Path.SEPARATOR)); 936 } 937 938 byte[][] getParent(byte[][] path) { 939 byte[][] result = new byte[path.length - 1][]; 940 for (int i = 0; i < result.length; i++) { 941 result[i] = new byte[path[i].length]; 942 System.arraycopy(path[i], 0, result[i], 0, path[i].length); 943 } 944 return result; 945 } 946 947 public Snapshot getSnapshot(DataInput in) throws IOException { 948 return snapshotMap.get(in.readInt()); 949 } 950 } 951 952 /** 953 * A one-shot class responsible for writing an image file. 954 * The write() function should be called once, after which the getter 955 * functions may be used to retrieve information about the file that was written. 
*/
  static class Saver {
    /** Source of the namesystem to save, the transaction id and cancellation checks. */
    private final SaveNamespaceContext context;

    /** Becomes true after save() has completed. */
    private boolean saved = false;

    /** MD5 of everything written to the image file; assigned at the end of save(). */
    private MD5Hash savedDigest;

    /** Handed to every inode / diff-list serialization call made by this saver. */
    private final ReferenceMap referenceMap = new ReferenceMap();

    /**
     * Under-construction files met while walking directories flagged as
     * "in snapshot", keyed by inode id.
     */
    private final Map<Long, INodeFileUnderConstruction> snapshotUCMap =
        new HashMap<Long, INodeFileUnderConstruction>();

    Saver(SaveNamespaceContext context) {
      this.context = context;
    }

    /** @throws IllegalStateException if no image has been saved by this instance yet */
    private void checkSaved() {
      if (saved) {
        return;
      }
      throw new IllegalStateException("FSImageSaver has not saved an image");
    }

    /** @throws IllegalStateException if this instance has already saved an image */
    private void checkNotSaved() {
      if (!saved) {
        return;
      }
      throw new IllegalStateException("FSImageSaver has already saved an image");
    }

    /**
     * @return the MD5 checksum of the image file written by save()
     * @throws IllegalStateException if save() has not completed yet
     */
    MD5Hash getSavedDigest() {
      checkSaved();
      return savedDigest;
    }

    /**
     * Writes the namespace of the context's source namesystem to the given
     * file and records the MD5 digest of the written bytes.
     *
     * @param newFile image file to create
     * @param compression writes the compression header and wraps the stream
     *        used for everything after the snapshot manager section
     * @throws IOException if writing fails
     * @throws IllegalStateException if this instance has already saved an image
     */
    void save(File newFile, FSImageCompression compression) throws IOException {
      checkNotSaved();

      final FSNamesystem srcNamesystem = context.getSourceNamesystem();
      final FSDirectory directory = srcNamesystem.dir;
      final String storageDirPath =
          newFile.getParentFile().getParentFile().getAbsolutePath();

      // startup-progress bookkeeping for the inode step
      final Step inodeStep = new Step(StepType.INODES, storageDirPath);
      final StartupProgress progress = NameNode.getStartupProgress();
      progress.beginStep(Phase.SAVING_CHECKPOINT, inodeStep);
      progress.setTotal(Phase.SAVING_CHECKPOINT, inodeStep,
          directory.rootDir.numItemsInTree());
      final Counter counter =
          progress.getCounter(Phase.SAVING_CHECKPOINT, inodeStep);
      final long startTime = now();

      // file stream -> digest stream -> data stream
      final MessageDigest md5 = MD5Hash.getDigester();
      final FileOutputStream fileOut = new FileOutputStream(newFile);
      final DigestOutputStream digestOut =
          new DigestOutputStream(fileOut, md5);
      DataOutputStream out = new DataOutputStream(digestOut);
      try {
        out.writeInt(HdfsConstants.LAYOUT_VERSION);
        LayoutFlags.write(out);
        // The non-locked getNamespaceInfo variant is used on purpose: the
        // thread coordinating saveNamespace already holds the namespace read
        // lock, and taking a second read lock from the saver thread could
        // run into a fairness-related deadlock. See the comments on HDFS-2223.
        out.writeInt(
            srcNamesystem.unprotectedGetNamespaceInfo().getNamespaceID());
        out.writeLong(directory.rootDir.numItemsInTree());
        out.writeLong(srcNamesystem.getGenerationStampV1());
        out.writeLong(srcNamesystem.getGenerationStampV2());
        out.writeLong(srcNamesystem.getGenerationStampAtblockIdSwitch());
        out.writeLong(srcNamesystem.getLastAllocatedBlockId());
        out.writeLong(context.getTxId());
        out.writeLong(srcNamesystem.getLastInodeId());

        srcNamesystem.getSnapshotManager().write(out);

        // emit the compression header; from here on write through the
        // stream returned by the compression codec
        out = compression.writeHeaderAndWrapStream(digestOut);
        LOG.info("Saving image file " + newFile + " using " + compression);

        // root inode first, then everything below it
        saveINode2Image(directory.rootDir, out, false, referenceMap, counter);
        saveImage(directory.rootDir, out, true, false, counter);
        progress.endStep(Phase.SAVING_CHECKPOINT, inodeStep);
        // The step is over: force the count up to the total, because
        // reference inodes are not counted while saving.
        progress.setCount(Phase.SAVING_CHECKPOINT, inodeStep,
            directory.rootDir.numItemsInTree());

        // Files under construction.
        // TODO: for HDFS-5428, fsimage compatibility cannot be broken, so
        // under-construction files that exist only in snapshots are stored in
        // this same section. As a temporary measure their paths are written
        // as "/.reserved/.inodes/<inodeid>" so that loading the image does
        // not add them to the lease map. This hack can go away once the
        // layout version can be bumped.
        srcNamesystem.saveFilesUnderConstruction(out, snapshotUCMap);

        context.checkCancelled();
        srcNamesystem.saveSecretManagerState(out, storageDirPath);
        context.checkCancelled();
        srcNamesystem.getCacheManager().saveState(out, storageDirPath);
        context.checkCancelled();
        out.flush();
        context.checkCancelled();
        fileOut.getChannel().force(true);
      } finally {
        out.close();
      }

      saved = true;
      // the digest now covers every byte that went to the file
      savedDigest = new MD5Hash(md5.digest());

      final long imageLength = newFile.length();
      final long elapsedSeconds = (now() - startTime) / 1000;
      LOG.info("Image file " + newFile + " of size " + imageLength
          + " bytes saved in " + elapsedSeconds + " seconds.");
    }

    /**
     * Writes the child count followed by every child inode.
     *
     * @param children children to write
     * @param out stream to write to
     * @param inSnapshot whether the parent directory or one of its ancestors
     *        is in the deleted list of some snapshot (because of a rename or
     *        a deletion)
     * @param counter startup-progress counter, passed on to the per-inode save
     * @return how many of the children are directories
     */
    private int saveChildren(ReadOnlyList<INode> children,
        DataOutputStream out, boolean inSnapshot, Counter counter)
        throws IOException {
      out.writeInt(children.size());

      int directoryCount = 0;
      int visited = 0;
      for (INode child : children) {
        // TODO: for HDFS-5428, the format/content of fsimage cannot change
        // here, so even below a snapshot directory an under-construction
        // inode is written like any other child rather than like the ones
        // stored in a deleted list.
        saveINode2Image(child, out, false, referenceMap, counter);

        if (child.isDirectory()) {
          directoryCount++;
        } else if (inSnapshot && child.isFile()) {
          final INodeFile file = child.asFile();
          if (file.isUnderConstruction()) {
            // remember it for the under-construction section
            snapshotUCMap.put(child.getId(),
                (INodeFileUnderConstruction) file);
          }
        }

        // poll for cancellation on the 1st, 51st, 101st, ... child
        if (visited % 50 == 0) {
          context.checkCancelled();
        }
        visited++;
      }
      return directoryCount;
    }

    /**
     * Recursively writes the tree rooted at {@code current}: first the
     * directory's own id, snapshot list, children and diff list, then the
     * same for each sub-directory.
     *
     * @param current directory to write
     * @param out stream to write to
     * @param toSaveSubtree when false only the directory's inode id is
     *        written; used for reference nodes whose subtree may have been
     *        written already
     * @param inSnapshot whether the current directory is in snapshot
     * @param counter startup-progress counter
     */
    private void saveImage(INodeDirectory current, DataOutputStream out,
        boolean toSaveSubtree, boolean inSnapshot, Counter counter)
        throws IOException {
      // 1. The inode id of this directory.
      out.writeLong(current.getId());
      if (!toSaveSubtree) {
        return;
      }

      final ReadOnlyList<INode> children = current.getChildrenList(null);

      // Directories collected from the snapshot data count as
      // sub-directories as well.
      List<INodeDirectory> snapshotDirs = null;
      int subDirCount = 0;
      if (current instanceof INodeDirectoryWithSnapshot) {
        final INodeDirectoryWithSnapshot withSnapshot =
            (INodeDirectoryWithSnapshot) current;
        snapshotDirs = new ArrayList<INodeDirectory>();
        withSnapshot.getSnapshotDirectory(snapshotDirs);
        subDirCount += snapshotDirs.size();
      }

      // 2. The snapshots of a snapshottable directory, or -1 as the number
      //    of snapshots otherwise.
      if (!(current instanceof INodeDirectorySnapshottable)) {
        out.writeInt(-1);
      } else {
        SnapshotFSImageFormat.saveSnapshots(
            (INodeDirectorySnapshottable) current, out);
      }

      // 3. The children.
      subDirCount += saveChildren(children, out, inSnapshot, counter);

      // 4. The DirectoryDiff list, if any.
      SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);

      // 5. The number of sub-directories, then each sub-directory's subtree,
      //    including possible snapshots of deleted sub-directories.
      out.writeInt(subDirCount);
      for (INode child : children) {
        if (child.isDirectory()) {
          // a subtree under a reference node is written only once
          final boolean writeSubtree = !child.isReference()
              || referenceMap.toProcessSubtree(child.getId());
          saveImage(child.asDirectory(), out, writeSubtree, inSnapshot,
              counter);
        }
      }
      if (snapshotDirs == null) {
        return;
      }
      for (INodeDirectory snapshotDir : snapshotDirs) {
        // a subtree under a reference node is written only once
        final boolean writeSubtree = snapshotDir.getParentReference() == null
            || referenceMap.toProcessSubtree(snapshotDir.getId());
        saveImage(snapshotDir, out, writeSubtree, true, counter);
      }
    }

    /**
     * Serializes one inode and bumps the progress counter.
     *
     * @param inode inode to write
     * @param out stream receiving the inode
     * @param writeUnderConstruction true if this is under construction
     * @param referenceMap map of reference inodes, passed to the serializer
     * @param counter startup-progress counter
     * @throws IOException if writing fails
     */
    private void saveINode2Image(INode inode, DataOutputStream out,
        boolean writeUnderConstruction, ReferenceMap referenceMap,
        Counter counter) throws IOException {
      FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
          referenceMap);
      // Reference inodes are deliberately left out of the count: at this
      // point it is too hard to tell whether a given reference counts
      // toward quota.
      if (inode instanceof INodeReference) {
        return;
      }
      counter.increment();
    }
  }
}