/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import static org.apache.hadoop.util.Time.now;

import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.LayoutVersion;
import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddBlockOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddCacheDirectiveInfoOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddCachePoolOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddCloseOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AllocateBlockIdOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AllowSnapshotOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.BlockListUpdatingOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.CancelDelegationTokenOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ClearNSQuotaOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ConcatDeleteOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.CreateSnapshotOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.DeleteOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.DeleteSnapshotOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.DisallowSnapshotOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.GetDelegationTokenOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.MkdirOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ModifyCacheDirectiveInfoOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ModifyCachePoolOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ReassignLeaseOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RemoveCacheDirectiveInfoOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RemoveCachePoolOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenameOldOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenameOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenameSnapshotOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenewDelegationTokenOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetGenstampV1Op;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetGenstampV2Op;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetNSQuotaOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetOwnerOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetPermissionsOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetQuotaOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetReplicationOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SymlinkOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.TimesOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.UpdateBlocksOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.UpdateMasterKeyOp;
import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
import org.apache.hadoop.hdfs.util.ChunkedArrayList;
import org.apache.hadoop.hdfs.util.Holder;

import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;

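/**
 * Loads edit log operations from an {@link EditLogInputStream} and applies
 * each one to the in-memory namespace (FSNamesystem / FSDirectory). This is
 * used when the NameNode replays edits at startup and when a standby
 * NameNode tails the shared edit log.
 *
 * A minimal driving sketch, not a verbatim caller from this codebase; the
 * {@code edits} stream and {@code fsn} namesystem are assumed to come from
 * the surrounding FSImage machinery:
 * <pre>
 *   FSEditLogLoader loader = new FSEditLogLoader(fsn, lastAppliedTxId);
 *   // Apply everything in the stream; a null recovery context means any
 *   // unreadable op aborts loading instead of being skipped.
 *   loader.loadFSEdits(edits, lastAppliedTxId + 1, null);
 *   long newLastTxId = loader.getLastAppliedTxId();
 * </pre>
 */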
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class FSEditLogLoader {
  static final Log LOG = LogFactory.getLog(FSEditLogLoader.class.getName());
  static long REPLAY_TRANSACTION_LOG_INTERVAL = 1000; // 1 sec
  private final FSNamesystem fsNamesys;
  private long lastAppliedTxId;

  public FSEditLogLoader(FSNamesystem fsNamesys, long lastAppliedTxId) {
    this.fsNamesys = fsNamesys;
    this.lastAppliedTxId = lastAppliedTxId;
  }

  /**
   * Load an edit log, and apply the changes to the in-memory structure.
   * This is where we apply edits that we've been writing to disk all
   * along.
   */
  long loadFSEdits(EditLogInputStream edits, long expectedStartingTxId,
      MetaRecoveryContext recovery) throws IOException {
    StartupProgress prog = NameNode.getStartupProgress();
    Step step = createStartupProgressStep(edits);
    prog.beginStep(Phase.LOADING_EDITS, step);
    fsNamesys.writeLock();
    try {
      long startTime = now();
      FSImage.LOG.info("Start loading edits file " + edits.getName());
      long numEdits = loadEditRecords(edits, false,
          expectedStartingTxId, recovery);
      FSImage.LOG.info("Edits file " + edits.getName()
          + " of size " + edits.length() + " edits # " + numEdits
          + " loaded in " + (now() - startTime) / 1000 + " seconds");
      return numEdits;
    } finally {
      edits.close();
      fsNamesys.writeUnlock();
      prog.endStep(Phase.LOADING_EDITS, step);
    }
  }

  long loadEditRecords(EditLogInputStream in, boolean closeOnExit,
                      long expectedStartingTxId, MetaRecoveryContext recovery)
      throws IOException {
    FSDirectory fsDir = fsNamesys.dir;

    EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts =
      new EnumMap<FSEditLogOpCodes, Holder<Integer>>(FSEditLogOpCodes.class);

    if (LOG.isTraceEnabled()) {
      LOG.trace("Acquiring write lock to replay edit log");
    }

    fsNamesys.writeLock();
    fsDir.writeLock();

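    // recentOpcodeOffsets is a small ring buffer of the stream offsets of
    // the most recently read ops. It exists purely to enrich the diagnostic
    // built by formatEditLogReplayError() if replay fails partway through.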
    long[] recentOpcodeOffsets = new long[4];
    Arrays.fill(recentOpcodeOffsets, -1);

    long expectedTxId = expectedStartingTxId;
    long numEdits = 0;
    long lastTxId = in.getLastTxId();
    long numTxns = (lastTxId - expectedStartingTxId) + 1;
    StartupProgress prog = NameNode.getStartupProgress();
    Step step = createStartupProgressStep(in);
    prog.setTotal(Phase.LOADING_EDITS, step, numTxns);
    Counter counter = prog.getCounter(Phase.LOADING_EDITS, step);
    long lastLogTime = now();
    long lastInodeId = fsNamesys.getLastInodeId();

    try {
      while (true) {
        try {
          FSEditLogOp op;
          try {
            op = in.readOp();
            if (op == null) {
              break;
            }
          } catch (Throwable e) {
            // Handle a problem with our input
            check203UpgradeFailure(in.getVersion(), e);
            String errorMessage =
              formatEditLogReplayError(in, recentOpcodeOffsets, expectedTxId);
            FSImage.LOG.error(errorMessage, e);
            if (recovery == null) {
              // We will only try to skip over problematic opcodes when in
              // recovery mode.
              throw new EditLogInputException(errorMessage, e, numEdits);
            }
            MetaRecoveryContext.editLogLoaderPrompt(
                "We failed to read txId " + expectedTxId,
                recovery, "skipping the bad section in the log");
            in.resync();
            continue;
          }
          recentOpcodeOffsets[(int) (numEdits % recentOpcodeOffsets.length)] =
            in.getPosition();
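          // Sanity-check the op's transaction id against the one we expect
          // next: a txid ahead of expectedTxId indicates a gap in the log,
          // while a txid behind it indicates an out-of-order (likely
          // duplicated) edit.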
          if (op.hasTransactionId()) {
            if (op.getTransactionId() > expectedTxId) {
              MetaRecoveryContext.editLogLoaderPrompt("There appears " +
                  "to be a gap in the edit log.  We expected txid " +
                  expectedTxId + ", but got txid " +
                  op.getTransactionId() + ".", recovery,
                  "ignoring missing transaction IDs");
            } else if (op.getTransactionId() < expectedTxId) {
              MetaRecoveryContext.editLogLoaderPrompt("There appears " +
                  "to be an out-of-order edit in the edit log.  We " +
                  "expected txid " + expectedTxId + ", but got txid " +
                  op.getTransactionId() + ".", recovery,
                  "skipping the out-of-order edit");
              continue;
            }
          }
          try {
            long inodeId = applyEditLogOp(op, fsDir, in.getVersion(), lastInodeId);
            if (lastInodeId < inodeId) {
              lastInodeId = inodeId;
            }
          } catch (Throwable e) {
            LOG.error("Encountered exception on operation " + op, e);
            MetaRecoveryContext.editLogLoaderPrompt("Failed to " +
                "apply edit log operation " + op + ": error " +
                e.getMessage(), recovery, "applying edits");
          }
          // Now that the operation has been successfully decoded and
          // applied, update our bookkeeping.
          incrOpCount(op.opCode, opCounts, step, counter);
          if (op.hasTransactionId()) {
            lastAppliedTxId = op.getTransactionId();
            expectedTxId = lastAppliedTxId + 1;
          } else {
            expectedTxId = lastAppliedTxId = expectedStartingTxId;
          }
          // log progress
          if (op.hasTransactionId()) {
            long now = now();
            if (now - lastLogTime > REPLAY_TRANSACTION_LOG_INTERVAL) {
              long deltaTxId = lastAppliedTxId - expectedStartingTxId + 1;
              int percent = Math.round((float) deltaTxId / numTxns * 100);
              LOG.info("replaying edit log: " + deltaTxId + "/" + numTxns
                  + " transactions completed. (" + percent + "%)");
              lastLogTime = now;
            }
          }
          numEdits++;
        } catch (MetaRecoveryContext.RequestStopException e) {
          MetaRecoveryContext.LOG.warn("Stopped reading edit log at " +
              in.getPosition() + "/" + in.length());
          break;
        }
      }
    } finally {
      fsNamesys.resetLastInodeId(lastInodeId);
      if (closeOnExit) {
        in.close();
      }
      fsDir.writeUnlock();
      fsNamesys.writeUnlock();

      if (LOG.isTraceEnabled()) {
        LOG.trace("replaying edit log finished");
      }

      if (FSImage.LOG.isDebugEnabled()) {
        dumpOpCounts(opCounts);
      }
    }
    return numEdits;
  }

  /**
   * Determine the inode id for an applied op. For layouts that predate inode
   * ids, a fresh id is allocated; otherwise the id recorded in the op is
   * used, and the namesystem's last allocated inode id is advanced when the
   * edit log has recorded a newer allocation than the loaded fsimage.
   */
  private long getAndUpdateLastInodeId(long inodeIdFromOp, int logVersion,
      long lastInodeId) throws IOException {
    long inodeId = inodeIdFromOp;

    if (inodeId == INodeId.GRANDFATHER_INODE_ID) {
      if (LayoutVersion.supports(Feature.ADD_INODE_ID, logVersion)) {
        throw new IOException("The layout version " + logVersion
            + " supports inodeId but gave bogus inodeId");
      }
      inodeId = fsNamesys.allocateNewInodeId();
    } else {
      // Need to reset lastInodeId: fsNamesys initializes lastInodeId from
      // the fsimage, but the edit log may record more recent inode id
      // allocations.
      if (inodeId > lastInodeId) {
        fsNamesys.resetLastInodeId(inodeId);
      }
    }
    return inodeId;
  }

  @SuppressWarnings("deprecation")
  private long applyEditLogOp(FSEditLogOp op, FSDirectory fsDir,
      int logVersion, long lastInodeId) throws IOException {
    long inodeId = INodeId.GRANDFATHER_INODE_ID;
    if (LOG.isTraceEnabled()) {
      LOG.trace("replaying edit log: " + op);
    }
    final boolean toAddRetryCache = fsNamesys.hasRetryCache() && op.hasRpcIds();
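    // When the retry cache is enabled and the op carries RPC ids, replaying
    // the op also repopulates the retry cache, so that a client retrying the
    // same RPC after a restart or failover gets a consistent answer instead
    // of the operation being applied twice.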

    switch (op.opCode) {
    case OP_ADD: {
      AddCloseOp addCloseOp = (AddCloseOp)op;
      if (FSNamesystem.LOG.isDebugEnabled()) {
        FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path +
            " numblocks : " + addCloseOp.blocks.length +
            " clientHolder " + addCloseOp.clientName +
            " clientMachine " + addCloseOp.clientMachine);
      }
      // There are three cases here:
      // 1. OP_ADD to create a new file
      // 2. OP_ADD to update file blocks
      // 3. OP_ADD to open file for append

      // See if the file already exists (persistBlocks call)
      final INodesInPath iip = fsDir.getLastINodeInPath(addCloseOp.path);
      final INodeFile oldFile = INodeFile.valueOf(
          iip.getINode(0), addCloseOp.path, true);
      INodeFile newFile = oldFile;
      if (oldFile == null) { // this is OP_ADD on a new file (case 1)
        // versions > 0 support per file replication
        // get name and replication
        final short replication = fsNamesys.getBlockManager()
            .adjustReplication(addCloseOp.replication);
        assert addCloseOp.blocks.length == 0;

        // add to the file tree
        inodeId = getAndUpdateLastInodeId(addCloseOp.inodeId, logVersion,
            lastInodeId);
        newFile = fsDir.unprotectedAddFile(inodeId,
            addCloseOp.path, addCloseOp.permissions, replication,
            addCloseOp.mtime, addCloseOp.atime, addCloseOp.blockSize, true,
            addCloseOp.clientName, addCloseOp.clientMachine);
        fsNamesys.leaseManager.addLease(addCloseOp.clientName, addCloseOp.path);

        // add the op into retry cache if necessary
        if (toAddRetryCache) {
          HdfsFileStatus stat = fsNamesys.dir.createFileStatus(
              HdfsFileStatus.EMPTY_NAME, newFile, null);
          fsNamesys.addCacheEntryWithPayload(addCloseOp.rpcClientId,
              addCloseOp.rpcCallId, stat);
        }
      } else { // This is OP_ADD on an existing file
        if (!oldFile.isUnderConstruction()) {
          // This is case 3: a call to append() on an already-closed file.
          if (FSNamesystem.LOG.isDebugEnabled()) {
            FSNamesystem.LOG.debug("Reopening an already-closed file " +
                "for append");
          }
          LocatedBlock lb = fsNamesys.prepareFileForWrite(addCloseOp.path,
              oldFile, addCloseOp.clientName, addCloseOp.clientMachine, null,
              false, iip.getLatestSnapshot(), false);
          newFile = INodeFile.valueOf(fsDir.getINode(addCloseOp.path),
              addCloseOp.path, true);

          // add the op into retry cache if necessary
          if (toAddRetryCache) {
            fsNamesys.addCacheEntryWithPayload(addCloseOp.rpcClientId,
                addCloseOp.rpcCallId, lb);
          }
        }
      }
      // Fall-through for case 2.
      // Regardless of whether it's a new file or an updated file,
      // update the block list.

      // Update the salient file attributes.
      newFile.setAccessTime(addCloseOp.atime, null, fsDir.getINodeMap());
      newFile.setModificationTime(addCloseOp.mtime, null, fsDir.getINodeMap());
      updateBlocks(fsDir, addCloseOp, newFile);
      break;
    }
    case OP_CLOSE: {
      AddCloseOp addCloseOp = (AddCloseOp)op;

      if (FSNamesystem.LOG.isDebugEnabled()) {
        FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path +
            " numblocks : " + addCloseOp.blocks.length +
            " clientHolder " + addCloseOp.clientName +
            " clientMachine " + addCloseOp.clientMachine);
      }

      final INodesInPath iip = fsDir.getLastINodeInPath(addCloseOp.path);
      final INodeFile oldFile = INodeFile.valueOf(
          iip.getINode(0), addCloseOp.path);

      // Update the salient file attributes.
      oldFile.setAccessTime(addCloseOp.atime, null, fsDir.getINodeMap());
      oldFile.setModificationTime(addCloseOp.mtime, null, fsDir.getINodeMap());
      updateBlocks(fsDir, addCloseOp, oldFile);

      // Now close the file
      if (!oldFile.isUnderConstruction() &&
          logVersion <= LayoutVersion.BUGFIX_HDFS_2991_VERSION) {
        // There was a bug (HDFS-2991) in hadoop < 0.23.1 where OP_CLOSE
        // could show up twice in a row. But after that version, this
        // should be fixed, so we should treat it as an error.
        throw new IOException(
            "File is not under construction: " + addCloseOp.path);
      }
      // One might expect that you could use removeLease(holder, path) here,
      // but OP_CLOSE doesn't serialize the holder. So, remove by path.
      if (oldFile.isUnderConstruction()) {
        INodeFileUnderConstruction ucFile = (INodeFileUnderConstruction) oldFile;
        fsNamesys.leaseManager.removeLeaseWithPrefixPath(addCloseOp.path);
        INodeFile newFile = ucFile.toINodeFile(ucFile.getModificationTime());
        fsDir.unprotectedReplaceINodeFile(addCloseOp.path, ucFile, newFile);
      }
      break;
    }
    case OP_UPDATE_BLOCKS: {
      UpdateBlocksOp updateOp = (UpdateBlocksOp)op;
      if (FSNamesystem.LOG.isDebugEnabled()) {
        FSNamesystem.LOG.debug(op.opCode + ": " + updateOp.path +
            " numblocks : " + updateOp.blocks.length);
      }
      INodeFile oldFile = INodeFile.valueOf(fsDir.getINode(updateOp.path),
          updateOp.path);
      // Update in-memory data structures
      updateBlocks(fsDir, updateOp, oldFile);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(updateOp.rpcClientId, updateOp.rpcCallId);
      }
      break;
    }
    case OP_ADD_BLOCK: {
      AddBlockOp addBlockOp = (AddBlockOp) op;
      String path = addBlockOp.getPath();
      if (FSNamesystem.LOG.isDebugEnabled()) {
        FSNamesystem.LOG.debug(op.opCode + ": " + path +
            " new block id : " + addBlockOp.getLastBlock().getBlockId());
      }
      INodeFile oldFile = INodeFile.valueOf(fsDir.getINode(path), path);
      // add the new block to the INodeFile
      addNewBlock(fsDir, addBlockOp, oldFile);
      break;
    }
    case OP_SET_REPLICATION: {
      SetReplicationOp setReplicationOp = (SetReplicationOp)op;
      short replication = fsNamesys.getBlockManager().adjustReplication(
          setReplicationOp.replication);
      fsDir.unprotectedSetReplication(setReplicationOp.path,
                                      replication, null);
      break;
    }
    case OP_CONCAT_DELETE: {
      ConcatDeleteOp concatDeleteOp = (ConcatDeleteOp)op;
      fsDir.unprotectedConcat(concatDeleteOp.trg, concatDeleteOp.srcs,
          concatDeleteOp.timestamp);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(concatDeleteOp.rpcClientId,
            concatDeleteOp.rpcCallId);
      }
      break;
    }
    case OP_RENAME_OLD: {
      RenameOldOp renameOp = (RenameOldOp)op;
      fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst,
                                renameOp.timestamp);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(renameOp.rpcClientId, renameOp.rpcCallId);
      }
      break;
    }
    case OP_DELETE: {
      DeleteOp deleteOp = (DeleteOp)op;
      fsDir.unprotectedDelete(deleteOp.path, deleteOp.timestamp);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(deleteOp.rpcClientId, deleteOp.rpcCallId);
      }
      break;
    }
    case OP_MKDIR: {
      MkdirOp mkdirOp = (MkdirOp)op;
      inodeId = getAndUpdateLastInodeId(mkdirOp.inodeId, logVersion,
          lastInodeId);
      fsDir.unprotectedMkdir(inodeId, mkdirOp.path, mkdirOp.permissions,
                             mkdirOp.timestamp);
      break;
    }
    case OP_SET_GENSTAMP_V1: {
      SetGenstampV1Op setGenstampV1Op = (SetGenstampV1Op)op;
      fsNamesys.setGenerationStampV1(setGenstampV1Op.genStampV1);
      break;
    }
    case OP_SET_PERMISSIONS: {
      SetPermissionsOp setPermissionsOp = (SetPermissionsOp)op;
      fsDir.unprotectedSetPermission(setPermissionsOp.src,
                                     setPermissionsOp.permissions);
      break;
    }
    case OP_SET_OWNER: {
      SetOwnerOp setOwnerOp = (SetOwnerOp)op;
      fsDir.unprotectedSetOwner(setOwnerOp.src, setOwnerOp.username,
                                setOwnerOp.groupname);
      break;
    }
    case OP_SET_NS_QUOTA: {
      SetNSQuotaOp setNSQuotaOp = (SetNSQuotaOp)op;
      fsDir.unprotectedSetQuota(setNSQuotaOp.src,
                                setNSQuotaOp.nsQuota,
                                HdfsConstants.QUOTA_DONT_SET);
      break;
    }
    case OP_CLEAR_NS_QUOTA: {
      ClearNSQuotaOp clearNSQuotaOp = (ClearNSQuotaOp)op;
      fsDir.unprotectedSetQuota(clearNSQuotaOp.src,
                                HdfsConstants.QUOTA_RESET,
                                HdfsConstants.QUOTA_DONT_SET);
      break;
    }

    case OP_SET_QUOTA: {
      SetQuotaOp setQuotaOp = (SetQuotaOp)op;
      fsDir.unprotectedSetQuota(setQuotaOp.src,
                                setQuotaOp.nsQuota,
                                setQuotaOp.dsQuota);
      break;
    }

    case OP_TIMES: {
      TimesOp timesOp = (TimesOp)op;

      fsDir.unprotectedSetTimes(timesOp.path,
                                timesOp.mtime,
                                timesOp.atime, true);
      break;
    }
    case OP_SYMLINK: {
      if (!FileSystem.areSymlinksEnabled()) {
        throw new IOException("Symlinks not supported - please remove " +
            "symlink before upgrading to this version of HDFS");
      }
      SymlinkOp symlinkOp = (SymlinkOp)op;
      inodeId = getAndUpdateLastInodeId(symlinkOp.inodeId, logVersion,
          lastInodeId);
      fsDir.unprotectedAddSymlink(inodeId, symlinkOp.path,
                                  symlinkOp.value, symlinkOp.mtime,
                                  symlinkOp.atime, symlinkOp.permissionStatus);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(symlinkOp.rpcClientId, symlinkOp.rpcCallId);
      }
      break;
    }
    case OP_RENAME: {
      RenameOp renameOp = (RenameOp)op;
      fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst,
                                renameOp.timestamp, renameOp.options);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(renameOp.rpcClientId, renameOp.rpcCallId);
      }
      break;
    }
    case OP_GET_DELEGATION_TOKEN: {
      GetDelegationTokenOp getDelegationTokenOp
        = (GetDelegationTokenOp)op;

      fsNamesys.getDelegationTokenSecretManager()
        .addPersistedDelegationToken(getDelegationTokenOp.token,
                                     getDelegationTokenOp.expiryTime);
      break;
    }
    case OP_RENEW_DELEGATION_TOKEN: {
      RenewDelegationTokenOp renewDelegationTokenOp
        = (RenewDelegationTokenOp)op;
      fsNamesys.getDelegationTokenSecretManager()
        .updatePersistedTokenRenewal(renewDelegationTokenOp.token,
                                     renewDelegationTokenOp.expiryTime);
      break;
    }
    case OP_CANCEL_DELEGATION_TOKEN: {
      CancelDelegationTokenOp cancelDelegationTokenOp
        = (CancelDelegationTokenOp)op;
      fsNamesys.getDelegationTokenSecretManager()
          .updatePersistedTokenCancellation(
              cancelDelegationTokenOp.token);
      break;
    }
    case OP_UPDATE_MASTER_KEY: {
      UpdateMasterKeyOp updateMasterKeyOp = (UpdateMasterKeyOp)op;
      fsNamesys.getDelegationTokenSecretManager()
        .updatePersistedMasterKey(updateMasterKeyOp.key);
      break;
    }
    case OP_REASSIGN_LEASE: {
      ReassignLeaseOp reassignLeaseOp = (ReassignLeaseOp)op;

      Lease lease = fsNamesys.leaseManager.getLease(
          reassignLeaseOp.leaseHolder);
      INodeFileUnderConstruction pendingFile =
          INodeFileUnderConstruction.valueOf(
              fsDir.getINode(reassignLeaseOp.path), reassignLeaseOp.path);
      fsNamesys.reassignLeaseInternal(lease,
          reassignLeaseOp.path, reassignLeaseOp.newHolder, pendingFile);
      break;
    }
    case OP_START_LOG_SEGMENT:
    case OP_END_LOG_SEGMENT: {
      // no data in here currently.
      break;
    }
    case OP_CREATE_SNAPSHOT: {
      CreateSnapshotOp createSnapshotOp = (CreateSnapshotOp) op;
      String path = fsNamesys.getSnapshotManager().createSnapshot(
          createSnapshotOp.snapshotRoot, createSnapshotOp.snapshotName);
      if (toAddRetryCache) {
        fsNamesys.addCacheEntryWithPayload(createSnapshotOp.rpcClientId,
            createSnapshotOp.rpcCallId, path);
      }
      break;
    }
    case OP_DELETE_SNAPSHOT: {
      DeleteSnapshotOp deleteSnapshotOp = (DeleteSnapshotOp) op;
      BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
      List<INode> removedINodes = new ChunkedArrayList<INode>();
      fsNamesys.getSnapshotManager().deleteSnapshot(
          deleteSnapshotOp.snapshotRoot, deleteSnapshotOp.snapshotName,
          collectedBlocks, removedINodes);
      fsNamesys.removeBlocksAndUpdateSafemodeTotal(collectedBlocks);
      collectedBlocks.clear();
      fsNamesys.dir.removeFromInodeMap(removedINodes);
      removedINodes.clear();

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(deleteSnapshotOp.rpcClientId,
            deleteSnapshotOp.rpcCallId);
      }
      break;
    }
    case OP_RENAME_SNAPSHOT: {
      RenameSnapshotOp renameSnapshotOp = (RenameSnapshotOp) op;
      fsNamesys.getSnapshotManager().renameSnapshot(
          renameSnapshotOp.snapshotRoot, renameSnapshotOp.snapshotOldName,
          renameSnapshotOp.snapshotNewName);

      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(renameSnapshotOp.rpcClientId,
            renameSnapshotOp.rpcCallId);
      }
      break;
    }
    case OP_ALLOW_SNAPSHOT: {
      AllowSnapshotOp allowSnapshotOp = (AllowSnapshotOp) op;
      fsNamesys.getSnapshotManager().setSnapshottable(
          allowSnapshotOp.snapshotRoot, false);
      break;
    }
    case OP_DISALLOW_SNAPSHOT: {
      DisallowSnapshotOp disallowSnapshotOp = (DisallowSnapshotOp) op;
      fsNamesys.getSnapshotManager().resetSnapshottable(
          disallowSnapshotOp.snapshotRoot);
      break;
    }
    case OP_SET_GENSTAMP_V2: {
      SetGenstampV2Op setGenstampV2Op = (SetGenstampV2Op) op;
      fsNamesys.setGenerationStampV2(setGenstampV2Op.genStampV2);
      break;
    }
    case OP_ALLOCATE_BLOCK_ID: {
      AllocateBlockIdOp allocateBlockIdOp = (AllocateBlockIdOp) op;
      fsNamesys.setLastAllocatedBlockId(allocateBlockIdOp.blockId);
      break;
    }
    case OP_ADD_CACHE_DIRECTIVE: {
      AddCacheDirectiveInfoOp addOp = (AddCacheDirectiveInfoOp) op;
      CacheDirectiveInfo result = fsNamesys
          .getCacheManager().addDirectiveFromEditLog(addOp.directive);
      if (toAddRetryCache) {
        Long id = result.getId();
        fsNamesys.addCacheEntryWithPayload(op.rpcClientId, op.rpcCallId, id);
      }
      break;
    }
    case OP_MODIFY_CACHE_DIRECTIVE: {
      ModifyCacheDirectiveInfoOp modifyOp =
          (ModifyCacheDirectiveInfoOp) op;
      fsNamesys.getCacheManager().modifyDirectiveFromEditLog(
          modifyOp.directive);
      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(op.rpcClientId, op.rpcCallId);
      }
      break;
    }
    case OP_REMOVE_CACHE_DIRECTIVE: {
      RemoveCacheDirectiveInfoOp removeOp =
          (RemoveCacheDirectiveInfoOp) op;
      fsNamesys.getCacheManager().removeDirective(removeOp.id, null);
      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(op.rpcClientId, op.rpcCallId);
      }
      break;
    }
    case OP_ADD_CACHE_POOL: {
      AddCachePoolOp addOp = (AddCachePoolOp) op;
      fsNamesys.getCacheManager().addCachePool(addOp.info);
      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(op.rpcClientId, op.rpcCallId);
      }
      break;
    }
    case OP_MODIFY_CACHE_POOL: {
      ModifyCachePoolOp modifyOp = (ModifyCachePoolOp) op;
      fsNamesys.getCacheManager().modifyCachePool(modifyOp.info);
      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(op.rpcClientId, op.rpcCallId);
      }
      break;
    }
    case OP_REMOVE_CACHE_POOL: {
      RemoveCachePoolOp removeOp = (RemoveCachePoolOp) op;
      fsNamesys.getCacheManager().removeCachePool(removeOp.poolName);
      if (toAddRetryCache) {
        fsNamesys.addCacheEntry(op.rpcClientId, op.rpcCallId);
      }
      break;
    }
    default:
      throw new IOException("Invalid operation read " + op.opCode);
    }
    return inodeId;
  }

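  /**
   * Build the diagnostic message used when edit log replay fails, including
   * the current stream offset, the transaction ID we expected next, and the
   * offsets of the most recently read opcodes.
   */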
  private static String formatEditLogReplayError(EditLogInputStream in,
      long[] recentOpcodeOffsets, long txid) {
    StringBuilder sb = new StringBuilder();
    sb.append("Error replaying edit log at offset ").append(in.getPosition());
    sb.append(".  Expected transaction ID was ").append(txid);
    if (recentOpcodeOffsets[0] != -1) {
      Arrays.sort(recentOpcodeOffsets);
      sb.append("\nRecent opcode offsets:");
      for (long offset : recentOpcodeOffsets) {
        if (offset != -1) {
          sb.append(' ').append(offset);
        }
      }
    }
    return sb.toString();
  }

  /**
   * Add a new block into the given INodeFile
   */
  private void addNewBlock(FSDirectory fsDir, AddBlockOp op, INodeFile file)
      throws IOException {
    BlockInfo[] oldBlocks = file.getBlocks();
    Block pBlock = op.getPenultimateBlock();
    Block newBlock = op.getLastBlock();
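    // The op carries the file's new last block and, when the file already
    // had blocks, its penultimate block (the previous last block), whose
    // length and state must be reconciled before the new block is appended.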

    if (pBlock != null) { // the penultimate block is not null
      Preconditions.checkState(oldBlocks != null && oldBlocks.length > 0);
      // compare pBlock with the last block of oldBlocks
      Block oldLastBlock = oldBlocks[oldBlocks.length - 1];
      if (oldLastBlock.getBlockId() != pBlock.getBlockId()
          || oldLastBlock.getGenerationStamp() != pBlock.getGenerationStamp()) {
        throw new IOException(
            "Mismatched block IDs or generation stamps for the old last block of file "
                + op.getPath() + ", the old last block is " + oldLastBlock
                + ", and the block read from editlog is " + pBlock);
      }

      oldLastBlock.setNumBytes(pBlock.getNumBytes());
      if (oldLastBlock instanceof BlockInfoUnderConstruction) {
        fsNamesys.getBlockManager().forceCompleteBlock(
            (INodeFileUnderConstruction) file,
            (BlockInfoUnderConstruction) oldLastBlock);
        fsNamesys.getBlockManager().processQueuedMessagesForBlock(pBlock);
      }
    } else { // the penultimate block is null
      Preconditions.checkState(oldBlocks == null || oldBlocks.length == 0);
    }
    // add the new block
    BlockInfo newBI = new BlockInfoUnderConstruction(
        newBlock, file.getBlockReplication());
    fsNamesys.getBlockManager().addBlockCollection(newBI, file);
    file.addBlock(newBI);
    fsNamesys.getBlockManager().processQueuedMessagesForBlock(newBlock);
  }

  /**
   * Update in-memory data structures with new block information.
   * @throws IOException
   */
  private void updateBlocks(FSDirectory fsDir, BlockListUpdatingOp op,
      INodeFile file) throws IOException {
    // Update its block list
    BlockInfo[] oldBlocks = file.getBlocks();
    Block[] newBlocks = op.getBlocks();
    String path = op.getPath();

    // Are we only updating the last block's gen stamp?
    boolean isGenStampUpdate = oldBlocks.length == newBlocks.length;
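    // When the op carries the same number of blocks as the file already has,
    // the only legal difference is a new generation stamp on the last block
    // (e.g. after pipeline recovery); any other mismatch below is treated as
    // corruption.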

    // First, update blocks in common
    for (int i = 0; i < oldBlocks.length && i < newBlocks.length; i++) {
      BlockInfo oldBlock = oldBlocks[i];
      Block newBlock = newBlocks[i];

      boolean isLastBlock = i == newBlocks.length - 1;
      if (oldBlock.getBlockId() != newBlock.getBlockId() ||
          (oldBlock.getGenerationStamp() != newBlock.getGenerationStamp() &&
              !(isGenStampUpdate && isLastBlock))) {
        throw new IOException("Mismatched block IDs or generation stamps, " +
            "attempting to replace block " + oldBlock + " with " + newBlock +
            " as block # " + i + "/" + newBlocks.length + " of " +
            path);
      }

      oldBlock.setNumBytes(newBlock.getNumBytes());
      boolean changeMade =
        oldBlock.getGenerationStamp() != newBlock.getGenerationStamp();
      oldBlock.setGenerationStamp(newBlock.getGenerationStamp());

      if (oldBlock instanceof BlockInfoUnderConstruction &&
          (!isLastBlock || op.shouldCompleteLastBlock())) {
        changeMade = true;
        fsNamesys.getBlockManager().forceCompleteBlock(
            (INodeFileUnderConstruction)file,
            (BlockInfoUnderConstruction)oldBlock);
      }
      if (changeMade) {
        // The state or gen-stamp of the block has changed. So, we may be
        // able to process some messages from datanodes that we previously
        // were unable to process.
        fsNamesys.getBlockManager().processQueuedMessagesForBlock(newBlock);
      }
    }

    if (newBlocks.length < oldBlocks.length) {
      // We're removing a block from the file, e.g. abandonBlock(...)
      if (!file.isUnderConstruction()) {
        throw new IOException("Trying to remove a block from file " +
            path + " which is not under construction.");
      }
      if (newBlocks.length != oldBlocks.length - 1) {
        throw new IOException("Trying to remove more than one block from file "
            + path);
      }
      Block oldBlock = oldBlocks[oldBlocks.length - 1];
      boolean removed = fsDir.unprotectedRemoveBlock(path,
          (INodeFileUnderConstruction) file, oldBlock);
      if (!removed && !(op instanceof UpdateBlocksOp)) {
        throw new IOException("Trying to delete non-existent block " + oldBlock);
      }
    } else if (newBlocks.length > oldBlocks.length) {
      // We're adding blocks
      for (int i = oldBlocks.length; i < newBlocks.length; i++) {
        Block newBlock = newBlocks[i];
        BlockInfo newBI;
        if (!op.shouldCompleteLastBlock()) {
          // TODO: shouldn't this only be true for the last block?
          // what about an old-version fsync() where fsync isn't called
          // until several blocks in?
          newBI = new BlockInfoUnderConstruction(
              newBlock, file.getBlockReplication());
        } else {
          // OP_CLOSE should add finalized blocks. This code path
          // is only executed when loading edits written by prior
          // versions of Hadoop. Current versions always log
          // OP_ADD operations as each block is allocated.
          newBI = new BlockInfo(newBlock, file.getBlockReplication());
        }
        fsNamesys.getBlockManager().addBlockCollection(newBI, file);
        file.addBlock(newBI);
        fsNamesys.getBlockManager().processQueuedMessagesForBlock(newBlock);
      }
    }
  }

  private static void dumpOpCounts(
      EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts) {
    StringBuilder sb = new StringBuilder();
    sb.append("Summary of operations loaded from edit log:\n  ");
    Joiner.on("\n  ").withKeyValueSeparator("=").appendTo(sb, opCounts);
    FSImage.LOG.debug(sb.toString());
  }

  private void incrOpCount(FSEditLogOpCodes opCode,
      EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts, Step step,
      Counter counter) {
    Holder<Integer> holder = opCounts.get(opCode);
    if (holder == null) {
      holder = new Holder<Integer>(1);
      opCounts.put(opCode, holder);
    } else {
      holder.held++;
    }
    counter.increment();
  }

  /**
   * Throw an appropriate exception during upgrade from 0.20.203, when edit
   * log loading could fail due to opcode conflicts.
   */
  private void check203UpgradeFailure(int logVersion, Throwable e)
      throws IOException {
    // The 0.20.203 release has conflicting opcodes with later releases.
    // The editlog must be emptied by restarting the namenode, before
    // proceeding with the upgrade.
    if (Storage.is203LayoutVersion(logVersion)
        && logVersion != HdfsConstants.LAYOUT_VERSION) {
      String msg = "During upgrade failed to load the editlog version "
          + logVersion + " from release 0.20.203. Please go back to the old"
          + " release and restart the namenode. This empties the editlog"
          + " and saves the namespace. Resume the upgrade after this step.";
      throw new IOException(msg, e);
    }
  }

  /**
   * Find the last valid transaction ID in the stream.
   * If there are invalid or corrupt transactions in the middle of the stream,
   * validateEditLog will skip over them.
   * This reads through the stream but does not close it.
   * Read errors encountered mid-stream are logged, the stream is resynced,
   * and scanning continues rather than propagating an exception.
   */
  static EditLogValidation validateEditLog(EditLogInputStream in) {
    long lastPos = 0;
    long lastTxId = HdfsConstants.INVALID_TXID;
    long numValid = 0;
    FSEditLogOp op = null;
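    // Scan ops until EOF. On a read error, log it, resync to the next valid
    // op boundary, and keep scanning so that a corrupt region in the middle
    // of the log does not hide valid transactions that follow it.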
    while (true) {
      lastPos = in.getPosition();
      try {
        if ((op = in.readOp()) == null) {
          break;
        }
      } catch (Throwable t) {
        FSImage.LOG.warn("Caught exception after reading " + numValid +
            " ops from " + in + " while determining its valid length. " +
            "Position was " + lastPos, t);
        in.resync();
        FSImage.LOG.warn("After resync, position is " + in.getPosition());
        continue;
      }
      if (lastTxId == HdfsConstants.INVALID_TXID
          || op.getTransactionId() > lastTxId) {
        lastTxId = op.getTransactionId();
      }
      numValid++;
    }
    return new EditLogValidation(lastPos, lastTxId, false);
  }

  static class EditLogValidation {
    private final long validLength;
    private final long endTxId;
    private final boolean hasCorruptHeader;

    EditLogValidation(long validLength, long endTxId,
        boolean hasCorruptHeader) {
      this.validLength = validLength;
      this.endTxId = endTxId;
      this.hasCorruptHeader = hasCorruptHeader;
    }

    long getValidLength() { return validLength; }

    long getEndTxId() { return endTxId; }

    boolean hasCorruptHeader() { return hasCorruptHeader; }
  }

  /**
   * Stream wrapper that keeps track of the current stream position.
   *
   * This stream also allows us to set a limit on how many bytes we can read
   * without getting an exception.
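   *
   * A minimal usage sketch (the wrapped {@code in} stream is hypothetical,
   * not a name from this file):
   * <pre>
   *   PositionTrackingInputStream pin = new PositionTrackingInputStream(in);
   *   pin.setLimit(100);       // reads beyond getPos() + 100 now throw
   *   int first = pin.read();  // advances getPos() by one on success
   *   pin.clearLimit();        // limit back to Long.MAX_VALUE
   * </pre>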
   */
  public static class PositionTrackingInputStream extends FilterInputStream
      implements StreamLimiter {
    private long curPos = 0;
    private long markPos = -1;
    private long limitPos = Long.MAX_VALUE;

    public PositionTrackingInputStream(InputStream is) {
      super(is);
    }

    private void checkLimit(long amt) throws IOException {
      long extra = (curPos + amt) - limitPos;
      if (extra > 0) {
        throw new IOException("Tried to read " + amt + " byte(s) past " +
            "the limit at offset " + limitPos);
      }
    }

    @Override
    public int read() throws IOException {
      checkLimit(1);
      int ret = super.read();
      if (ret != -1) curPos++;
      return ret;
    }

    @Override
    public int read(byte[] data) throws IOException {
      checkLimit(data.length);
      int ret = super.read(data);
      if (ret > 0) curPos += ret;
      return ret;
    }

    @Override
    public int read(byte[] data, int offset, int length) throws IOException {
      checkLimit(length);
      int ret = super.read(data, offset, length);
      if (ret > 0) curPos += ret;
      return ret;
    }

    @Override
    public void setLimit(long limit) {
      limitPos = curPos + limit;
    }

    @Override
    public void clearLimit() {
      limitPos = Long.MAX_VALUE;
    }

    @Override
    public void mark(int limit) {
      super.mark(limit);
      markPos = curPos;
    }

    @Override
    public void reset() throws IOException {
      if (markPos == -1) {
        throw new IOException("Not marked!");
      }
      super.reset();
      curPos = markPos;
      markPos = -1;
    }

    public long getPos() {
      return curPos;
    }

    @Override
    public long skip(long amt) throws IOException {
      long extra = (curPos + amt) - limitPos;
      if (extra > 0) {
        throw new IOException("Tried to skip " + extra + " bytes past " +
            "the limit at offset " + limitPos);
      }
      long ret = super.skip(amt);
      curPos += ret;
      return ret;
    }
  }

  public long getLastAppliedTxId() {
    return lastAppliedTxId;
  }

  /**
   * Creates a Step used for updating startup progress, populated with
   * information from the given edits.  The step always includes the log's
   * name.  If the log has a known length, then the length is included in the
   * step too.
   *
   * @param edits EditLogInputStream to use for populating step
   * @return Step populated with information from edits
   * @throws IOException thrown if there is an I/O error
   */
  private static Step createStartupProgressStep(EditLogInputStream edits)
      throws IOException {
    long length = edits.length();
    String name = edits.getCurrentStreamName();
    return length != -1 ? new Step(name, length) : new Step(name);
  }
}