001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs;
019
020import java.io.FileInputStream;
021import java.io.IOException;
022import java.net.InetSocketAddress;
023import java.net.Socket;
024import java.nio.ByteBuffer;
025import java.util.AbstractMap;
026import java.util.ArrayList;
027import java.util.EnumSet;
028import java.util.HashMap;
029import java.util.HashSet;
030import java.util.Iterator;
031import java.util.List;
032import java.util.Map;
033import java.util.Map.Entry;
034import java.util.Set;
035import java.util.concurrent.ConcurrentHashMap;
036
037import org.apache.commons.io.IOUtils;
038import org.apache.hadoop.classification.InterfaceAudience;
039import org.apache.hadoop.fs.ByteBufferReadable;
040import org.apache.hadoop.fs.ByteBufferUtil;
041import org.apache.hadoop.fs.CanSetDropBehind;
042import org.apache.hadoop.fs.CanSetReadahead;
043import org.apache.hadoop.fs.ChecksumException;
044import org.apache.hadoop.fs.FSInputStream;
045import org.apache.hadoop.fs.HasEnhancedByteBufferAccess;
046import org.apache.hadoop.fs.ReadOption;
047import org.apache.hadoop.fs.UnresolvedLinkException;
048import org.apache.hadoop.hdfs.client.ClientMmap;
049import org.apache.hadoop.hdfs.net.DomainPeer;
050import org.apache.hadoop.hdfs.net.Peer;
051import org.apache.hadoop.hdfs.net.TcpPeerServer;
052import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
053import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
054import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
055import org.apache.hadoop.hdfs.protocol.LocatedBlock;
056import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
057import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
058import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
059import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
060import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
061import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
062import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
063import org.apache.hadoop.io.ByteBufferPool;
064import org.apache.hadoop.ipc.RPC;
065import org.apache.hadoop.ipc.RemoteException;
066import org.apache.hadoop.net.NetUtils;
067import org.apache.hadoop.net.unix.DomainSocket;
068import org.apache.hadoop.security.AccessControlException;
069import org.apache.hadoop.security.token.SecretManager.InvalidToken;
070import org.apache.hadoop.security.token.Token;
071import org.apache.hadoop.util.IdentityHashStore;
072
073import com.google.common.annotations.VisibleForTesting;
074
075/****************************************************************
 * DFSInputStream provides bytes from a named file.  It handles
 * negotiation with the namenode and various datanodes as necessary.
078 ****************************************************************/
079@InterfaceAudience.Private
080public class DFSInputStream extends FSInputStream
081implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
082    HasEnhancedByteBufferAccess {
083  @VisibleForTesting
084  static boolean tcpReadsDisabledForTesting = false;
085  private final PeerCache peerCache;
086  private final DFSClient dfsClient;
087  private boolean closed = false;
088  private final String src;
089  private BlockReader blockReader = null;
090  private final boolean verifyChecksum;
091  private LocatedBlocks locatedBlocks = null;
092  private long lastBlockBeingWrittenLength = 0;
093  private DatanodeInfo currentNode = null;
094  private LocatedBlock currentLocatedBlock = null;
095  private long pos = 0;
096  private long blockEnd = -1;
097  private CachingStrategy cachingStrategy;
098  private final ReadStatistics readStatistics = new ReadStatistics();
099
100  /**
101   * Track the ByteBuffers that we have handed out to readers.
102   * 
   * The value type can be either ByteBufferPool or ClientMmap, depending on
   * whether this is a memory-mapped buffer or not.
105   */
106  private final IdentityHashStore<ByteBuffer, Object>
107      extendedReadBuffers = new IdentityHashStore<ByteBuffer, Object>(0);
108
109  public static class ReadStatistics {
110    public ReadStatistics() {
111      this.totalBytesRead = 0;
112      this.totalLocalBytesRead = 0;
113      this.totalShortCircuitBytesRead = 0;
114      this.totalZeroCopyBytesRead = 0;
115    }
116
117    public ReadStatistics(ReadStatistics rhs) {
118      this.totalBytesRead = rhs.getTotalBytesRead();
119      this.totalLocalBytesRead = rhs.getTotalLocalBytesRead();
120      this.totalShortCircuitBytesRead = rhs.getTotalShortCircuitBytesRead();
121      this.totalZeroCopyBytesRead = rhs.getTotalZeroCopyBytesRead();
122    }
123
124    /**
125     * @return The total bytes read.  This will always be at least as
126     * high as the other numbers, since it includes all of them.
127     */
128    public long getTotalBytesRead() {
129      return totalBytesRead;
130    }
131
132    /**
133     * @return The total local bytes read.  This will always be at least
134     * as high as totalShortCircuitBytesRead, since all short-circuit
135     * reads are also local.
136     */
137    public long getTotalLocalBytesRead() {
138      return totalLocalBytesRead;
139    }
140
141    /**
142     * @return The total short-circuit local bytes read.
143     */
144    public long getTotalShortCircuitBytesRead() {
145      return totalShortCircuitBytesRead;
146    }
147    
148    /**
149     * @return The total number of zero-copy bytes read.
150     */
151    public long getTotalZeroCopyBytesRead() {
152      return totalZeroCopyBytesRead;
153    }
154
155    /**
156     * @return The total number of bytes read which were not local.
157     */
158    public long getRemoteBytesRead() {
159      return totalBytesRead - totalLocalBytesRead;
160    }
161    
162    void addRemoteBytes(long amt) {
163      this.totalBytesRead += amt;
164    }
165
166    void addLocalBytes(long amt) {
167      this.totalBytesRead += amt;
168      this.totalLocalBytesRead += amt;
169    }
170
171    void addShortCircuitBytes(long amt) {
172      this.totalBytesRead += amt;
173      this.totalLocalBytesRead += amt;
174      this.totalShortCircuitBytesRead += amt;
175    }
176
177    void addZeroCopyBytes(long amt) {
178      this.totalBytesRead += amt;
179      this.totalLocalBytesRead += amt;
180      this.totalShortCircuitBytesRead += amt;
181      this.totalZeroCopyBytesRead += amt;
182    }
183    
184    private long totalBytesRead;
185
186    private long totalLocalBytesRead;
187
188    private long totalShortCircuitBytesRead;
189
190    private long totalZeroCopyBytesRead;
191  }
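
  /*
   * Illustrative sketch (not part of this class; "in" is assumed to be a
   * DFSInputStream obtained from a DFSClient): how a caller might inspect
   * these statistics after reading. The counters are cumulative for the life
   * of the stream, and getRemoteBytesRead() is simply
   * totalBytesRead - totalLocalBytesRead.
   *
   *   byte[] buf = new byte[8192];
   *   in.read(buf, 0, buf.length);
   *   ReadStatistics stats = in.getReadStatistics();  // returns a copy
   *   long local  = stats.getTotalLocalBytesRead();
   *   long remote = stats.getRemoteBytesRead();       // bytes read over the network
   */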
192  
193  private final FileInputStreamCache fileInputStreamCache;
194
195  /**
196   * This variable tracks the number of failures since the start of the
197   * most recent user-facing operation. That is to say, it should be reset
198   * whenever the user makes a call on this stream, and if at any point
199   * during the retry logic, the failure count exceeds a threshold,
200   * the errors will be thrown back to the operation.
201   *
   * Specifically, this counts the number of times the client has gone
   * back to the namenode to get a new list of block locations; it is
   * capped at maxBlockAcquireFailures.
205   */
206  private int failures = 0;
207
  /* XXX Use of ConcurrentHashMap is a temporary fix. Need to fix
   * parallel accesses to DFSInputStream (through positional reads) properly. */
210  private final ConcurrentHashMap<DatanodeInfo, DatanodeInfo> deadNodes =
211             new ConcurrentHashMap<DatanodeInfo, DatanodeInfo>();
212  private int buffersize = 1;
213  
214  private final byte[] oneByteBuf = new byte[1]; // used for 'int read()'
215
216  void addToDeadNodes(DatanodeInfo dnInfo) {
217    deadNodes.put(dnInfo, dnInfo);
218  }
219  
  DFSInputStream(DFSClient dfsClient, String src, int buffersize,
      boolean verifyChecksum) throws IOException, UnresolvedLinkException {
222    this.dfsClient = dfsClient;
223    this.verifyChecksum = verifyChecksum;
224    this.buffersize = buffersize;
225    this.src = src;
226    this.peerCache = dfsClient.peerCache;
227    this.fileInputStreamCache = new FileInputStreamCache(
228        dfsClient.getConf().shortCircuitStreamsCacheSize,
229        dfsClient.getConf().shortCircuitStreamsCacheExpiryMs);
230    this.cachingStrategy =
231        dfsClient.getDefaultReadCachingStrategy();
232    openInfo();
233  }
234
235  /**
236   * Grab the open-file info from namenode
237   */
238  synchronized void openInfo() throws IOException, UnresolvedLinkException {
239    lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
240    int retriesForLastBlockLength = dfsClient.getConf().retryTimesForGetLastBlockLength;
241    while (retriesForLastBlockLength > 0) {
      // Getting last block length as -1 is a special case. When the cluster
      // restarts, DNs may not report immediately. At this time partial block
      // locations will not be available with the NN for getting the length.
      // Let's retry a few times (configurable) to get the length.
246      if (lastBlockBeingWrittenLength == -1) {
247        DFSClient.LOG.warn("Last block locations not available. "
248            + "Datanodes might not have reported blocks completely."
249            + " Will retry for " + retriesForLastBlockLength + " times");
250        waitFor(dfsClient.getConf().retryIntervalForGetLastBlockLength);
251        lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
252      } else {
253        break;
254      }
255      retriesForLastBlockLength--;
256    }
257    if (retriesForLastBlockLength == 0) {
258      throw new IOException("Could not obtain the last block locations.");
259    }
260  }
261
262  private void waitFor(int waitTime) throws IOException {
263    try {
264      Thread.sleep(waitTime);
265    } catch (InterruptedException e) {
266      throw new IOException(
267          "Interrupted while getting the last block length.");
268    }
269  }
270
271  private long fetchLocatedBlocksAndGetLastBlockLength() throws IOException {
272    final LocatedBlocks newInfo = dfsClient.getLocatedBlocks(src, 0);
273    if (DFSClient.LOG.isDebugEnabled()) {
274      DFSClient.LOG.debug("newInfo = " + newInfo);
275    }
276    if (newInfo == null) {
277      throw new IOException("Cannot open filename " + src);
278    }
279
280    if (locatedBlocks != null) {
281      Iterator<LocatedBlock> oldIter = locatedBlocks.getLocatedBlocks().iterator();
282      Iterator<LocatedBlock> newIter = newInfo.getLocatedBlocks().iterator();
283      while (oldIter.hasNext() && newIter.hasNext()) {
284        if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) {
285          throw new IOException("Blocklist for " + src + " has changed!");
286        }
287      }
288    }
289    locatedBlocks = newInfo;
290    long lastBlockBeingWrittenLength = 0;
291    if (!locatedBlocks.isLastBlockComplete()) {
292      final LocatedBlock last = locatedBlocks.getLastLocatedBlock();
293      if (last != null) {
294        if (last.getLocations().length == 0) {
295          if (last.getBlockSize() == 0) {
296            // if the length is zero, then no data has been written to
297            // datanode. So no need to wait for the locations.
298            return 0;
299          }
300          return -1;
301        }
302        final long len = readBlockLength(last);
303        last.getBlock().setNumBytes(len);
304        lastBlockBeingWrittenLength = len; 
305      }
306    }
307
308    currentNode = null;
309    return lastBlockBeingWrittenLength;
310  }
311
312  /** Read the block length from one of the datanodes. */
313  private long readBlockLength(LocatedBlock locatedblock) throws IOException {
314    assert locatedblock != null : "LocatedBlock cannot be null";
315    int replicaNotFoundCount = locatedblock.getLocations().length;
316    
317    for(DatanodeInfo datanode : locatedblock.getLocations()) {
318      ClientDatanodeProtocol cdp = null;
319      
320      try {
321        cdp = DFSUtil.createClientDatanodeProtocolProxy(datanode,
322            dfsClient.getConfiguration(), dfsClient.getConf().socketTimeout,
323            dfsClient.getConf().connectToDnViaHostname, locatedblock);
324        
325        final long n = cdp.getReplicaVisibleLength(locatedblock.getBlock());
326        
327        if (n >= 0) {
328          return n;
329        }
330      }
331      catch(IOException ioe) {
332        if (ioe instanceof RemoteException &&
333          (((RemoteException) ioe).unwrapRemoteException() instanceof
334            ReplicaNotFoundException)) {
335          // special case : replica might not be on the DN, treat as 0 length
336          replicaNotFoundCount--;
337        }
338        
339        if (DFSClient.LOG.isDebugEnabled()) {
340          DFSClient.LOG.debug("Failed to getReplicaVisibleLength from datanode "
341              + datanode + " for block " + locatedblock.getBlock(), ioe);
342        }
343      } finally {
344        if (cdp != null) {
345          RPC.stopProxy(cdp);
346        }
347      }
348    }
349
    // The namenode told us about these locations, but none of them knows
    // about the replica, which means we hit the race between pipeline
    // creation start and end. We require ReplicaNotFoundException from every
    // datanode because some other exception could have happened on a DN that
    // does have the replica, and we want to report that error instead.
354    if (replicaNotFoundCount == 0) {
355      return 0;
356    }
357
358    throw new IOException("Cannot obtain block length for " + locatedblock);
359  }
360  
361  public synchronized long getFileLength() {
362    return locatedBlocks == null? 0:
363        locatedBlocks.getFileLength() + lastBlockBeingWrittenLength;
364  }
365
366  // Short circuit local reads are forbidden for files that are
367  // under construction.  See HDFS-2757.
368  synchronized boolean shortCircuitForbidden() {
369    return locatedBlocks.isUnderConstruction();
370  }
371
372  /**
373   * Returns the datanode from which the stream is currently reading.
374   */
375  public DatanodeInfo getCurrentDatanode() {
376    return currentNode;
377  }
378
379  /**
380   * Returns the block containing the target position. 
381   */
382  synchronized public ExtendedBlock getCurrentBlock() {
383    if (currentLocatedBlock == null){
384      return null;
385    }
386    return currentLocatedBlock.getBlock();
387  }
388
389  /**
   * Return the collection of blocks that have already been located.
391   */
392  public synchronized List<LocatedBlock> getAllBlocks() throws IOException {
393    return getBlockRange(0, getFileLength());
394  }
395
396  /**
397   * Get block at the specified position.
398   * Fetch it from the namenode if not cached.
399   * 
   * @param offset offset within the file
   * @param updatePosition whether to update current position
   * @return located block
   * @throws IOException if the block at the given offset cannot be obtained
404   */
405  private synchronized LocatedBlock getBlockAt(long offset,
406      boolean updatePosition) throws IOException {
407    assert (locatedBlocks != null) : "locatedBlocks is null";
408
409    final LocatedBlock blk;
410
411    //check offset
412    if (offset < 0 || offset >= getFileLength()) {
413      throw new IOException("offset < 0 || offset >= getFileLength(), offset="
414          + offset
415          + ", updatePosition=" + updatePosition
416          + ", locatedBlocks=" + locatedBlocks);
417    }
418    else if (offset >= locatedBlocks.getFileLength()) {
419      // offset to the portion of the last block,
420      // which is not known to the name-node yet;
421      // getting the last block 
422      blk = locatedBlocks.getLastLocatedBlock();
423    }
424    else {
425      // search cached blocks first
426      int targetBlockIdx = locatedBlocks.findBlock(offset);
427      if (targetBlockIdx < 0) { // block is not cached
428        targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
429        // fetch more blocks
430        final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
431        assert (newBlocks != null) : "Could not find target position " + offset;
432        locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
433      }
434      blk = locatedBlocks.get(targetBlockIdx);
435    }
436
437    // update current position
438    if (updatePosition) {
439      pos = offset;
440      blockEnd = blk.getStartOffset() + blk.getBlockSize() - 1;
441      currentLocatedBlock = blk;
442    }
443    return blk;
444  }
445
446  /** Fetch a block from namenode and cache it */
447  private synchronized void fetchBlockAt(long offset) throws IOException {
448    int targetBlockIdx = locatedBlocks.findBlock(offset);
449    if (targetBlockIdx < 0) { // block is not cached
450      targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
451    }
452    // fetch blocks
453    final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
454    if (newBlocks == null) {
455      throw new IOException("Could not find target position " + offset);
456    }
457    locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
458  }
459
460  /**
461   * Get blocks in the specified range.
   * Fetch them from the namenode if not cached. This function
   * does not allow a read request beyond the EOF.
   * @param offset starting offset within the file
   * @param length length of the range in bytes
   * @return consecutive segment of located blocks
   * @throws IOException if the blocks cannot be obtained
468   */
469  private synchronized List<LocatedBlock> getBlockRange(long offset, 
470                                                        long length) 
471                                                      throws IOException {
472    // getFileLength(): returns total file length
473    // locatedBlocks.getFileLength(): returns length of completed blocks
474    if (offset >= getFileLength()) {
475      throw new IOException("Offset: " + offset +
476        " exceeds file length: " + getFileLength());
477    }
478
479    final List<LocatedBlock> blocks;
480    final long lengthOfCompleteBlk = locatedBlocks.getFileLength();
481    final boolean readOffsetWithinCompleteBlk = offset < lengthOfCompleteBlk;
482    final boolean readLengthPastCompleteBlk = offset + length > lengthOfCompleteBlk;
483
484    if (readOffsetWithinCompleteBlk) {
485      //get the blocks of finalized (completed) block range
486      blocks = getFinalizedBlockRange(offset, 
487        Math.min(length, lengthOfCompleteBlk - offset));
488    } else {
489      blocks = new ArrayList<LocatedBlock>(1);
490    }
491
492    // get the blocks from incomplete block range
493    if (readLengthPastCompleteBlk) {
494       blocks.add(locatedBlocks.getLastLocatedBlock());
495    }
496
497    return blocks;
498  }
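
  /*
   * Worked example for getBlockRange (illustrative only; assumes a 128 MB
   * block size): suppose the file has two finalized blocks plus a last block
   * that is still being written. locatedBlocks.getFileLength() then covers
   * only the finalized 256 MB, while getFileLength() also includes
   * lastBlockBeingWrittenLength. A request with offset = 200 MB and
   * length = 100 MB returns the second finalized block from
   * getFinalizedBlockRange() and then appends getLastLocatedBlock(), because
   * offset + length reaches past the finalized portion.
   */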
499
500  /**
501   * Get blocks in the specified range.
502   * Includes only the complete blocks.
503   * Fetch them from the namenode if not cached.
504   */
505  private synchronized List<LocatedBlock> getFinalizedBlockRange(
506      long offset, long length) throws IOException {
507    assert (locatedBlocks != null) : "locatedBlocks is null";
508    List<LocatedBlock> blockRange = new ArrayList<LocatedBlock>();
509    // search cached blocks first
510    int blockIdx = locatedBlocks.findBlock(offset);
511    if (blockIdx < 0) { // block is not cached
512      blockIdx = LocatedBlocks.getInsertIndex(blockIdx);
513    }
514    long remaining = length;
515    long curOff = offset;
516    while(remaining > 0) {
517      LocatedBlock blk = null;
518      if(blockIdx < locatedBlocks.locatedBlockCount())
519        blk = locatedBlocks.get(blockIdx);
520      if (blk == null || curOff < blk.getStartOffset()) {
521        LocatedBlocks newBlocks;
522        newBlocks = dfsClient.getLocatedBlocks(src, curOff, remaining);
523        locatedBlocks.insertRange(blockIdx, newBlocks.getLocatedBlocks());
524        continue;
525      }
526      assert curOff >= blk.getStartOffset() : "Block not found";
527      blockRange.add(blk);
528      long bytesRead = blk.getStartOffset() + blk.getBlockSize() - curOff;
529      remaining -= bytesRead;
530      curOff += bytesRead;
531      blockIdx++;
532    }
533    return blockRange;
534  }
535
536  /**
   * Open a BlockReader to a DataNode so that the target block can be read.
   * The block ID and the IDs of the destination datanodes are obtained from
   * the namenode.
539   */
540  private synchronized DatanodeInfo blockSeekTo(long target) throws IOException {
541    if (target >= getFileLength()) {
542      throw new IOException("Attempted to read past end of file");
543    }
544
545    // Will be getting a new BlockReader.
546    if (blockReader != null) {
547      blockReader.close();
548      blockReader = null;
549    }
550
551    //
552    // Connect to best DataNode for desired Block, with potential offset
553    //
554    DatanodeInfo chosenNode = null;
555    int refetchToken = 1; // only need to get a new access token once
556    int refetchEncryptionKey = 1; // only need to get a new encryption key once
557    
558    boolean connectFailedOnce = false;
559
560    while (true) {
561      //
562      // Compute desired block
563      //
564      LocatedBlock targetBlock = getBlockAt(target, true);
      assert (target==pos) : "Wrong position " + pos + ", expected " + target;
566      long offsetIntoBlock = target - targetBlock.getStartOffset();
567
568      DNAddrPair retval = chooseDataNode(targetBlock);
569      chosenNode = retval.info;
570      InetSocketAddress targetAddr = retval.addr;
571
572      try {
573        ExtendedBlock blk = targetBlock.getBlock();
574        Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
575        blockReader = getBlockReader(targetAddr, chosenNode, src, blk,
576            accessToken, offsetIntoBlock, blk.getNumBytes() - offsetIntoBlock,
577            buffersize, verifyChecksum, dfsClient.clientName, cachingStrategy);
578        if(connectFailedOnce) {
579          DFSClient.LOG.info("Successfully connected to " + targetAddr +
580                             " for " + blk);
581        }
582        return chosenNode;
583      } catch (AccessControlException ex) {
584        DFSClient.LOG.warn("Short circuit access failed " + ex);
585        dfsClient.disableLegacyBlockReaderLocal();
586        continue;
587      } catch (IOException ex) {
588        if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
589          DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
590              + "encryption key was invalid when connecting to " + targetAddr
591              + " : " + ex);
592          // The encryption key used is invalid.
593          refetchEncryptionKey--;
594          dfsClient.clearDataEncryptionKey();
595        } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) {
596          refetchToken--;
597          fetchBlockAt(target);
598        } else {
599          connectFailedOnce = true;
600          DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block"
601            + ", add to deadNodes and continue. " + ex, ex);
602          // Put chosen node into dead list, continue
603          addToDeadNodes(chosenNode);
604        }
605      }
606    }
607  }
608
609  /**
610   * Close it down!
611   */
612  @Override
613  public synchronized void close() throws IOException {
614    if (closed) {
615      return;
616    }
617    dfsClient.checkOpen();
618
619    if (!extendedReadBuffers.isEmpty()) {
620      final StringBuilder builder = new StringBuilder();
621      extendedReadBuffers.visitAll(new IdentityHashStore.Visitor<ByteBuffer, Object>() {
622        private String prefix = "";
623        @Override
624        public void accept(ByteBuffer k, Object v) {
625          builder.append(prefix).append(k);
626          prefix = ", ";
627        }
628      });
629      DFSClient.LOG.warn("closing file " + src + ", but there are still " +
630          "unreleased ByteBuffers allocated by read().  " +
631          "Please release " + builder.toString() + ".");
632    }
633    if (blockReader != null) {
634      blockReader.close();
635      blockReader = null;
636    }
637    super.close();
638    fileInputStreamCache.close();
639    closed = true;
640  }
641
642  @Override
643  public synchronized int read() throws IOException {
644    int ret = read( oneByteBuf, 0, 1 );
645    return ( ret <= 0 ) ? -1 : (oneByteBuf[0] & 0xff);
646  }
647
648  /**
649   * Wraps different possible read implementations so that readBuffer can be
650   * strategy-agnostic.
651   */
652  private interface ReaderStrategy {
653    public int doRead(BlockReader blockReader, int off, int len,
654        ReadStatistics readStatistics) throws ChecksumException, IOException;
655  }
656
657  private static void updateReadStatistics(ReadStatistics readStatistics, 
658        int nRead, BlockReader blockReader) {
659    if (nRead <= 0) return;
660    if (blockReader.isShortCircuit()) {
661      readStatistics.totalBytesRead += nRead;
662      readStatistics.totalLocalBytesRead += nRead;
663      readStatistics.totalShortCircuitBytesRead += nRead;
664    } else if (blockReader.isLocal()) {
665      readStatistics.totalBytesRead += nRead;
666      readStatistics.totalLocalBytesRead += nRead;
667    } else {
668      readStatistics.totalBytesRead += nRead;
669    }
670  }
671  
672  /**
673   * Used to read bytes into a byte[]
674   */
675  private static class ByteArrayStrategy implements ReaderStrategy {
676    final byte[] buf;
677
678    public ByteArrayStrategy(byte[] buf) {
679      this.buf = buf;
680    }
681
682    @Override
683    public int doRead(BlockReader blockReader, int off, int len,
684            ReadStatistics readStatistics) throws ChecksumException, IOException {
685        int nRead = blockReader.read(buf, off, len);
686        updateReadStatistics(readStatistics, nRead, blockReader);
687        return nRead;
688    }
689  }
690
691  /**
692   * Used to read bytes into a user-supplied ByteBuffer
693   */
694  private static class ByteBufferStrategy implements ReaderStrategy {
695    final ByteBuffer buf;
696    ByteBufferStrategy(ByteBuffer buf) {
697      this.buf = buf;
698    }
699
700    @Override
701    public int doRead(BlockReader blockReader, int off, int len,
702        ReadStatistics readStatistics) throws ChecksumException, IOException {
703      int oldpos = buf.position();
704      int oldlimit = buf.limit();
705      boolean success = false;
706      try {
707        int ret = blockReader.read(buf);
708        success = true;
709        updateReadStatistics(readStatistics, ret, blockReader);
710        return ret;
711      } finally {
712        if (!success) {
713          // Reset to original state so that retries work correctly.
714          buf.position(oldpos);
715          buf.limit(oldlimit);
716        }
717      } 
718    }
719  }
720
  /* This is used by the regular read() path and handles ChecksumExceptions.
   * The name readBuffer() is chosen to imply similarity to readBuffer() in
   * ChecksumFileSystem.
   */
725  private synchronized int readBuffer(ReaderStrategy reader, int off, int len,
726      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
727      throws IOException {
728    IOException ioe;
729    
    /* We retry the current node only once, so this is set to true only here.
     * The intention is to handle one common case of an error that is not a
     * failure on the datanode or client: the DataNode closing the connection
     * because the client is idle. If there are other such cases of
     * "non-errors" then a datanode might be retried by setting this to true
     * again.
     */
736    boolean retryCurrentNode = true;
737
738    while (true) {
739      // retry as many times as seekToNewSource allows.
740      try {
741        return reader.doRead(blockReader, off, len, readStatistics);
742      } catch ( ChecksumException ce ) {
743        DFSClient.LOG.warn("Found Checksum error for "
744            + getCurrentBlock() + " from " + currentNode
745            + " at " + ce.getPos());        
746        ioe = ce;
747        retryCurrentNode = false;
748        // we want to remember which block replicas we have tried
749        addIntoCorruptedBlockMap(getCurrentBlock(), currentNode,
750            corruptedBlockMap);
751      } catch ( IOException e ) {
752        if (!retryCurrentNode) {
753          DFSClient.LOG.warn("Exception while reading from "
754              + getCurrentBlock() + " of " + src + " from "
755              + currentNode, e);
756        }
757        ioe = e;
758      }
759      boolean sourceFound = false;
760      if (retryCurrentNode) {
761        /* possibly retry the same node so that transient errors don't
762         * result in application level failures (e.g. Datanode could have
763         * closed the connection because the client is idle for too long).
764         */ 
765        sourceFound = seekToBlockSource(pos);
766      } else {
767        addToDeadNodes(currentNode);
768        sourceFound = seekToNewSource(pos);
769      }
770      if (!sourceFound) {
771        throw ioe;
772      }
773      retryCurrentNode = false;
774    }
775  }
776
777  private int readWithStrategy(ReaderStrategy strategy, int off, int len) throws IOException {
778    dfsClient.checkOpen();
779    if (closed) {
780      throw new IOException("Stream closed");
781    }
782    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 
783      = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
784    failures = 0;
785    if (pos < getFileLength()) {
786      int retries = 2;
787      while (retries > 0) {
788        try {
789          // currentNode can be left as null if previous read had a checksum
790          // error on the same block. See HDFS-3067
791          if (pos > blockEnd || currentNode == null) {
792            currentNode = blockSeekTo(pos);
793          }
794          int realLen = (int) Math.min(len, (blockEnd - pos + 1L));
795          if (locatedBlocks.isLastBlockComplete()) {
796            realLen = (int) Math.min(realLen, locatedBlocks.getFileLength());
797          }
798          int result = readBuffer(strategy, off, realLen, corruptedBlockMap);
799          
800          if (result >= 0) {
801            pos += result;
802          } else {
            // got an EOS from the reader though we expect more data on it.
804            throw new IOException("Unexpected EOS from the reader");
805          }
806          if (dfsClient.stats != null && result != -1) {
807            dfsClient.stats.incrementBytesRead(result);
808          }
809          return result;
810        } catch (ChecksumException ce) {
811          throw ce;            
812        } catch (IOException e) {
813          if (retries == 1) {
814            DFSClient.LOG.warn("DFS Read", e);
815          }
816          blockEnd = -1;
817          if (currentNode != null) { addToDeadNodes(currentNode); }
818          if (--retries == 0) {
819            throw e;
820          }
821        } finally {
          // Check whether we need to report corrupt block replicas, whether
          // the read was successful or a ChecksumException occurred.
824          reportCheckSumFailure(corruptedBlockMap, 
825              currentLocatedBlock.getLocations().length);
826        }
827      }
828    }
829    return -1;
830  }
831
832  /**
   * Read up to {@code len} bytes into the specified portion of the buffer.
834   */
835  @Override
836  public synchronized int read(final byte buf[], int off, int len) throws IOException {
837    ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf);
838
839    return readWithStrategy(byteArrayReader, off, len);
840  }
841
842  @Override
843  public synchronized int read(final ByteBuffer buf) throws IOException {
844    ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf);
845
846    return readWithStrategy(byteBufferReader, 0, buf.remaining());
847  }
848
849
850  /**
851   * Add corrupted block replica into map.
852   * @param corruptedBlockMap 
853   */
854  private void addIntoCorruptedBlockMap(ExtendedBlock blk, DatanodeInfo node, 
855      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
856    Set<DatanodeInfo> dnSet = null;
857    if((corruptedBlockMap.containsKey(blk))) {
858      dnSet = corruptedBlockMap.get(blk);
859    }else {
860      dnSet = new HashSet<DatanodeInfo>();
861    }
862    if (!dnSet.contains(node)) {
863      dnSet.add(node);
864      corruptedBlockMap.put(blk, dnSet);
865    }
866  }
867      
868  private DNAddrPair chooseDataNode(LocatedBlock block)
869    throws IOException {
870    while (true) {
871      DatanodeInfo[] nodes = block.getLocations();
872      try {
873        DatanodeInfo chosenNode = bestNode(nodes, deadNodes);
874        final String dnAddr =
875            chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname);
876        if (DFSClient.LOG.isDebugEnabled()) {
877          DFSClient.LOG.debug("Connecting to datanode " + dnAddr);
878        }
879        InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr);
880        return new DNAddrPair(chosenNode, targetAddr);
881      } catch (IOException ie) {
882        String blockInfo = block.getBlock() + " file=" + src;
883        if (failures >= dfsClient.getMaxBlockAcquireFailures()) {
884          throw new BlockMissingException(src, "Could not obtain block: " + blockInfo,
885                                          block.getStartOffset());
886        }
887        
888        if (nodes == null || nodes.length == 0) {
889          DFSClient.LOG.info("No node available for " + blockInfo);
890        }
891        DFSClient.LOG.info("Could not obtain " + block.getBlock()
892            + " from any node: " + ie
893            + ". Will get new block locations from namenode and retry...");
894        try {
          // Introduce a random factor to the wait time before another retry.
          // The wait time depends on the number of failures and a random factor.
          // On the first BlockMissingException the wait time is a random
          // number between 0..3000 ms. If the first retry still fails, we
          // wait a 3000 ms grace period before the 2nd retry, and the waiting
          // window expands to 6000 ms to reduce the request rate on the
          // server. Similarly the 3rd retry waits a 6000 ms grace period
          // before retrying and the waiting window expands to 9000 ms.
904          final int timeWindow = dfsClient.getConf().timeWindow;
905          double waitTime = timeWindow * failures +       // grace period for the last round of attempt
906            timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble(); // expanding time window for each failure
907          DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1) + " IOException, will wait for " + waitTime + " msec.");
908          Thread.sleep((long)waitTime);
909        } catch (InterruptedException iex) {
910        }
911        deadNodes.clear(); //2nd option is to remove only nodes[blockId]
912        openInfo();
913        block = getBlockAt(block.getStartOffset(), false);
914        failures++;
915        continue;
916      }
917    }
918  } 
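
  /*
   * Illustrative arithmetic for the retry backoff above (assuming the default
   * timeWindow of 3000 ms; the actual value comes from configuration):
   *
   *   failures = 0: wait = 3000*0 + 3000*1*rand -> 0..3000 ms
   *   failures = 1: wait = 3000*1 + 3000*2*rand -> 3000..9000 ms
   *   failures = 2: wait = 3000*2 + 3000*3*rand -> 6000..15000 ms
   *
   * Each failure adds a fixed grace period of timeWindow*failures plus an
   * expanding random window of timeWindow*(failures+1).
   */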
919      
920  private void fetchBlockByteRange(LocatedBlock block, long start, long end,
921      byte[] buf, int offset,
922      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
923      throws IOException {
924    //
925    // Connect to best DataNode for desired Block, with potential offset
926    //
927    int refetchToken = 1; // only need to get a new access token once
928    int refetchEncryptionKey = 1; // only need to get a new encryption key once
929    
930    while (true) {
931      // cached block locations may have been updated by chooseDataNode()
932      // or fetchBlockAt(). Always get the latest list of locations at the 
933      // start of the loop.
934      CachingStrategy curCachingStrategy;
935      synchronized (this) {
936        block = getBlockAt(block.getStartOffset(), false);
937        curCachingStrategy = cachingStrategy;
938      }
939      DNAddrPair retval = chooseDataNode(block);
940      DatanodeInfo chosenNode = retval.info;
941      InetSocketAddress targetAddr = retval.addr;
942      BlockReader reader = null;
943          
944      try {
945        Token<BlockTokenIdentifier> blockToken = block.getBlockToken();
946            
947        int len = (int) (end - start + 1);
948        reader = getBlockReader(targetAddr, chosenNode, src, block.getBlock(),
949            blockToken, start, len, buffersize, verifyChecksum,
950            dfsClient.clientName, curCachingStrategy);
951        int nread = reader.readAll(buf, offset, len);
952        if (nread != len) {
953          throw new IOException("truncated return from reader.read(): " +
                                "expected " + len + ", got " + nread);
955        }
956        return;
957      } catch (ChecksumException e) {
958        DFSClient.LOG.warn("fetchBlockByteRange(). Got a checksum exception for " +
959                 src + " at " + block.getBlock() + ":" + 
960                 e.getPos() + " from " + chosenNode);
961        // we want to remember what we have tried
962        addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
963      } catch (AccessControlException ex) {
964        DFSClient.LOG.warn("Short circuit access failed " + ex);
965        dfsClient.disableLegacyBlockReaderLocal();
966        continue;
967      } catch (IOException e) {
968        if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
969          DFSClient.LOG.info("Will fetch a new encryption key and retry, " 
970              + "encryption key was invalid when connecting to " + targetAddr
971              + " : " + e);
972          // The encryption key used is invalid.
973          refetchEncryptionKey--;
974          dfsClient.clearDataEncryptionKey();
975          continue;
976        } else if (refetchToken > 0 && tokenRefetchNeeded(e, targetAddr)) {
977          refetchToken--;
978          fetchBlockAt(block.getStartOffset());
979          continue;
980        } else {
981          DFSClient.LOG.warn("Failed to connect to " + targetAddr + 
982              " for file " + src + " for block " + block.getBlock() + ":" + e);
983          if (DFSClient.LOG.isDebugEnabled()) {
984            DFSClient.LOG.debug("Connection failure ", e);
985          }
986        }
987      } finally {
988        if (reader != null) {
989          reader.close();
990        }
991      }
992      // Put chosen node into dead list, continue
993      addToDeadNodes(chosenNode);
994    }
995  }
996
997  /**
   * Should the block access token be refetched on an exception?
   * 
   * @param ex Exception received
   * @param targetAddr Target datanode address from where exception was received
   * @return true if the block access token has expired or is invalid and
   *         should be refetched
1004   */
1005  private static boolean tokenRefetchNeeded(IOException ex,
1006      InetSocketAddress targetAddr) {
1007    /*
1008     * Get a new access token and retry. Retry is needed in 2 cases. 1)
1009     * When both NN and DN re-started while DFSClient holding a cached
1010     * access token. 2) In the case that NN fails to update its
1011     * access key at pre-set interval (by a wide margin) and
1012     * subsequently restarts. In this case, DN re-registers itself with
1013     * NN and receives a new access key, but DN will delete the old
1014     * access key from its memory since it's considered expired based on
1015     * the estimated expiration date.
1016     */
1017    if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) {
1018      DFSClient.LOG.info("Access token was invalid when connecting to "
1019          + targetAddr + " : " + ex);
1020      return true;
1021    }
1022    return false;
1023  }
1024
1025  private Peer newTcpPeer(InetSocketAddress addr) throws IOException {
1026    Peer peer = null;
1027    boolean success = false;
1028    Socket sock = null;
1029    try {
1030      sock = dfsClient.socketFactory.createSocket();
1031      NetUtils.connect(sock, addr,
1032        dfsClient.getRandomLocalInterfaceAddr(),
1033        dfsClient.getConf().socketTimeout);
1034      peer = TcpPeerServer.peerFromSocketAndKey(sock, 
1035          dfsClient.getDataEncryptionKey());
1036      success = true;
1037      return peer;
1038    } finally {
1039      if (!success) {
1040        IOUtils.closeQuietly(peer);
1041        IOUtils.closeQuietly(sock);
1042      }
1043    }
1044  }
1045
1046  /**
1047   * Retrieve a BlockReader suitable for reading.
1048   * This method will reuse the cached connection to the DN if appropriate.
1049   * Otherwise, it will create a new connection.
1050   * Throwing an IOException from this method is basically equivalent to 
   * declaring the DataNode bad, so we try to connect in several different
   * ways before doing that.
1053   *
1054   * @param dnAddr  Address of the datanode
1055   * @param chosenNode Chosen datanode information
1056   * @param file  File location
1057   * @param block  The Block object
1058   * @param blockToken  The access token for security
1059   * @param startOffset  The read offset, relative to block head
1060   * @param len  The number of bytes to read
1061   * @param bufferSize  The IO buffer size (not the client buffer size)
1062   * @param verifyChecksum  Whether to verify checksum
1063   * @param clientName  Client name
   * @param curCachingStrategy  caching strategy to use
1065   * @return New BlockReader instance
1066   */
1067  protected BlockReader getBlockReader(InetSocketAddress dnAddr,
1068                                       DatanodeInfo chosenNode,
1069                                       String file,
1070                                       ExtendedBlock block,
1071                                       Token<BlockTokenIdentifier> blockToken,
1072                                       long startOffset,
1073                                       long len,
1074                                       int bufferSize,
1075                                       boolean verifyChecksum,
1076                                       String clientName,
1077                                       CachingStrategy curCachingStrategy)
1078      throws IOException {
1079    // Firstly, we check to see if we have cached any file descriptors for
1080    // local blocks.  If so, we can just re-use those file descriptors.
1081    FileInputStream fis[] = fileInputStreamCache.get(chosenNode, block);
1082    if (fis != null) {
1083      if (DFSClient.LOG.isDebugEnabled()) {
1084        DFSClient.LOG.debug("got FileInputStreams for " + block + " from " +
1085            "the FileInputStreamCache.");
1086      }
1087      return new BlockReaderLocal.Builder(dfsClient.getConf()).
1088          setFilename(file).
1089          setBlock(block).
1090          setStartOffset(startOffset).
1091          setStreams(fis).
1092          setDatanodeID(chosenNode).
1093          setVerifyChecksum(verifyChecksum).
1094          setBlockMetadataHeader(BlockMetadataHeader.
1095              preadHeader(fis[1].getChannel())).
1096          setFileInputStreamCache(fileInputStreamCache).
1097          setCachingStrategy(curCachingStrategy).
1098          build();
1099    }
1100    
1101    // If the legacy local block reader is enabled and we are reading a local
1102    // block, try to create a BlockReaderLocalLegacy.  The legacy local block
1103    // reader implements local reads in the style first introduced by HDFS-2246.
1104    if ((dfsClient.useLegacyBlockReaderLocal()) &&
1105        DFSClient.isLocalAddress(dnAddr) &&
1106        (!shortCircuitForbidden())) {
1107      try {
1108        return BlockReaderFactory.getLegacyBlockReaderLocal(dfsClient,
1109            clientName, block, blockToken, chosenNode, startOffset);
1110      } catch (IOException e) {
1111        DFSClient.LOG.warn("error creating legacy BlockReaderLocal.  " +
1112            "Disabling legacy local reads.", e);
1113        dfsClient.disableLegacyBlockReaderLocal();
1114      }
1115    }
1116
1117    // Look for cached domain peers.
1118    int cacheTries = 0;
1119    DomainSocketFactory dsFactory = dfsClient.getDomainSocketFactory();
1120    BlockReader reader = null;
1121    final int nCachedConnRetry = dfsClient.getConf().nCachedConnRetry;
1122    for (; cacheTries < nCachedConnRetry; ++cacheTries) {
1123      Peer peer = peerCache.get(chosenNode, true);
1124      if (peer == null) break;
1125      try {
1126        boolean allowShortCircuitLocalReads = dfsClient.getConf().
1127            shortCircuitLocalReads && (!shortCircuitForbidden());
1128        reader = BlockReaderFactory.newBlockReader(
1129            dfsClient.getConf(), file, block, blockToken, startOffset,
1130            len, verifyChecksum, clientName, peer, chosenNode, 
1131            dsFactory, peerCache, fileInputStreamCache,
1132            allowShortCircuitLocalReads, curCachingStrategy);
1133        return reader;
1134      } catch (IOException ex) {
1135        DFSClient.LOG.debug("Error making BlockReader with DomainSocket. " +
1136            "Closing stale " + peer, ex);
1137      } finally {
1138        if (reader == null) {
1139          IOUtils.closeQuietly(peer);
1140        }
1141      }
1142    }
1143
1144    // Try to create a DomainPeer.
1145    DomainSocket domSock = dsFactory.create(dnAddr, this);
1146    if (domSock != null) {
1147      Peer peer = new DomainPeer(domSock);
1148      try {
1149        boolean allowShortCircuitLocalReads = dfsClient.getConf().
1150            shortCircuitLocalReads && (!shortCircuitForbidden());
1151        reader = BlockReaderFactory.newBlockReader(
1152            dfsClient.getConf(), file, block, blockToken, startOffset,
1153            len, verifyChecksum, clientName, peer, chosenNode,
1154            dsFactory, peerCache, fileInputStreamCache,
1155            allowShortCircuitLocalReads, curCachingStrategy);
1156        return reader;
1157      } catch (IOException e) {
1158        DFSClient.LOG.warn("failed to connect to " + domSock, e);
1159      } finally {
1160        if (reader == null) {
1161         // If the Peer that we got the error from was a DomainPeer,
1162         // mark the socket path as bad, so that newDataSocket will not try 
1163         // to re-open this socket for a while.
1164         dsFactory.disableDomainSocketPath(domSock.getPath());
1165         IOUtils.closeQuietly(peer);
1166        }
1167      }
1168    }
1169
1170    // Look for cached peers.
1171    for (; cacheTries < nCachedConnRetry; ++cacheTries) {
1172      Peer peer = peerCache.get(chosenNode, false);
1173      if (peer == null) break;
1174      try {
1175        reader = BlockReaderFactory.newBlockReader(
1176            dfsClient.getConf(), file, block, blockToken, startOffset,
1177            len, verifyChecksum, clientName, peer, chosenNode, 
1178            dsFactory, peerCache, fileInputStreamCache, false,
1179            curCachingStrategy);
1180        return reader;
1181      } catch (IOException ex) {
1182        DFSClient.LOG.debug("Error making BlockReader. Closing stale " +
1183          peer, ex);
1184      } finally {
1185        if (reader == null) {
1186          IOUtils.closeQuietly(peer);
1187        }
1188      }
1189    }
1190    if (tcpReadsDisabledForTesting) {
1191      throw new IOException("TCP reads are disabled.");
1192    }
1193    // Try to create a new remote peer.
1194    Peer peer = newTcpPeer(dnAddr);
1195    try {
1196      reader = BlockReaderFactory.newBlockReader(dfsClient.getConf(), file,
1197          block, blockToken, startOffset, len, verifyChecksum, clientName,
1198          peer, chosenNode, dsFactory, peerCache, fileInputStreamCache, false,
1199        curCachingStrategy);
1200      return reader;
1201    } catch (IOException ex) {
1202      DFSClient.LOG.debug(
1203          "Exception while getting block reader, closing stale " + peer, ex);
1204      throw ex;
    } finally {
      if (reader == null) {
        IOUtils.closeQuietly(peer);
      }
    }
1210  }
1211
1212
1213  /**
1214   * Read bytes starting from the specified position.
1215   * 
1216   * @param position start read from this position
1217   * @param buffer read buffer
1218   * @param offset offset into buffer
1219   * @param length number of bytes to read
1220   * 
1221   * @return actual number of bytes read
1222   */
1223  @Override
1224  public int read(long position, byte[] buffer, int offset, int length)
1225    throws IOException {
1226    // sanity checks
1227    dfsClient.checkOpen();
1228    if (closed) {
1229      throw new IOException("Stream closed");
1230    }
1231    failures = 0;
1232    long filelen = getFileLength();
1233    if ((position < 0) || (position >= filelen)) {
1234      return -1;
1235    }
1236    int realLen = length;
1237    if ((position + length) > filelen) {
1238      realLen = (int)(filelen - position);
1239    }
1240    
1241    // determine the block and byte range within the block
1242    // corresponding to position and realLen
1243    List<LocatedBlock> blockRange = getBlockRange(position, realLen);
1244    int remaining = realLen;
1245    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap 
1246      = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
1247    for (LocatedBlock blk : blockRange) {
1248      long targetStart = position - blk.getStartOffset();
1249      long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart);
1250      try {
1251        fetchBlockByteRange(blk, targetStart, 
1252            targetStart + bytesToRead - 1, buffer, offset, corruptedBlockMap);
1253      } finally {
1254        // Check and report if any block replicas are corrupted.
1255        // BlockMissingException may be caught if all block replicas are
1256        // corrupted.
1257        reportCheckSumFailure(corruptedBlockMap, blk.getLocations().length);
1258      }
1259
1260      remaining -= bytesToRead;
1261      position += bytesToRead;
1262      offset += bytesToRead;
1263    }
1264    assert remaining == 0 : "Wrong number of bytes read.";
1265    if (dfsClient.stats != null) {
1266      dfsClient.stats.incrementBytesRead(realLen);
1267    }
1268    return realLen;
1269  }
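
  /*
   * Minimal usage sketch for the positional read above (illustrative only;
   * "in" is assumed to be an open DFSInputStream). Unlike the stateful
   * read(byte[], int, int), this pread variant does not move the stream
   * position:
   *
   *   byte[] buf = new byte[4096];
   *   int n = in.read(1024L, buf, 0, buf.length);
   *   // n == min(buf.length, fileLength - 1024); n == -1 if 1024 >= fileLength
   */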
1270  
1271  /**
   * DFSInputStream reports checksum failure.
   * Case I : the client has tried multiple data nodes and at least one of the
   * attempts has succeeded. We report the other failures as corrupted block
   * replicas to the namenode.
   * Case II: the client has tried all data nodes, but all failed. We only
   * report if the total number of replicas is 1, since otherwise the failure
   * may be due to the client itself being unable to read rather than to
   * corrupt replicas.
   * @param corruptedBlockMap map of corrupted blocks
   * @param dataNodeCount number of data nodes that hold the block replicas
1282   */
1283  private void reportCheckSumFailure(
1284      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap, 
1285      int dataNodeCount) {
1286    if (corruptedBlockMap.isEmpty()) {
1287      return;
1288    }
1289    Iterator<Entry<ExtendedBlock, Set<DatanodeInfo>>> it = corruptedBlockMap
1290        .entrySet().iterator();
1291    Entry<ExtendedBlock, Set<DatanodeInfo>> entry = it.next();
1292    ExtendedBlock blk = entry.getKey();
1293    Set<DatanodeInfo> dnSet = entry.getValue();
1294    if (((dnSet.size() < dataNodeCount) && (dnSet.size() > 0))
1295        || ((dataNodeCount == 1) && (dnSet.size() == dataNodeCount))) {
1296      DatanodeInfo[] locs = new DatanodeInfo[dnSet.size()];
1297      int i = 0;
1298      for (DatanodeInfo dn:dnSet) {
1299        locs[i++] = dn;
1300      }
1301      LocatedBlock [] lblocks = { new LocatedBlock(blk, locs) };
1302      dfsClient.reportChecksumFailure(src, lblocks);
1303    }
1304    corruptedBlockMap.clear();
1305  }
1306
1307  @Override
1308  public long skip(long n) throws IOException {
1309    if ( n > 0 ) {
1310      long curPos = getPos();
1311      long fileLen = getFileLength();
1312      if( n+curPos > fileLen ) {
1313        n = fileLen - curPos;
1314      }
1315      seek(curPos+n);
1316      return n;
1317    }
1318    return n < 0 ? -1 : 0;
1319  }
1320
1321  /**
1322   * Seek to a new arbitrary location
1323   */
1324  @Override
1325  public synchronized void seek(long targetPos) throws IOException {
1326    if (targetPos > getFileLength()) {
1327      throw new IOException("Cannot seek after EOF");
1328    }
1329    if (targetPos < 0) {
1330      throw new IOException("Cannot seek to negative offset");
1331    }
1332    if (closed) {
1333      throw new IOException("Stream is closed!");
1334    }
1335    boolean done = false;
1336    if (pos <= targetPos && targetPos <= blockEnd) {
1337      //
      // If this seek is to a position within the current block, ahead of
      // the current position, and that data might already be lying in
      // the TCP buffer, then just eat up the intervening data.
1341      //
1342      int diff = (int)(targetPos - pos);
1343      if (diff <= blockReader.available()) {
1344        try {
1345          pos += blockReader.skip(diff);
1346          if (pos == targetPos) {
1347            done = true;
1348          }
1349        } catch (IOException e) {//make following read to retry
1350          if(DFSClient.LOG.isDebugEnabled()) {
1351            DFSClient.LOG.debug("Exception while seek to " + targetPos
1352                + " from " + getCurrentBlock() + " of " + src + " from "
1353                + currentNode, e);
1354          }
1355        }
1356      }
1357    }
1358    if (!done) {
1359      pos = targetPos;
1360      blockEnd = -1;
1361    }
1362  }
1363
1364  /**
1365   * Same as {@link #seekToNewSource(long)} except that it does not exclude
1366   * the current datanode and might connect to the same node.
1367   */
1368  private synchronized boolean seekToBlockSource(long targetPos)
1369                                                 throws IOException {
1370    currentNode = blockSeekTo(targetPos);
1371    return true;
1372  }
1373  
1374  /**
1375   * Seek to given position on a node other than the current node.  If
1376   * a node other than the current node is found, then returns true. 
1377   * If another node could not be found, then returns false.
1378   */
1379  @Override
1380  public synchronized boolean seekToNewSource(long targetPos) throws IOException {
1381    boolean markedDead = deadNodes.containsKey(currentNode);
1382    addToDeadNodes(currentNode);
1383    DatanodeInfo oldNode = currentNode;
1384    DatanodeInfo newNode = blockSeekTo(targetPos);
1385    if (!markedDead) {
      /* remove it from deadNodes. blockSeekTo could have cleared
       * deadNodes and added currentNode again. That's ok. */
1388      deadNodes.remove(oldNode);
1389    }
1390    if (!oldNode.getDatanodeUuid().equals(newNode.getDatanodeUuid())) {
1391      currentNode = newNode;
1392      return true;
1393    } else {
1394      return false;
1395    }
1396  }
1397      
  /**
   * Return the current position within the file.
   */
1400  @Override
1401  public synchronized long getPos() throws IOException {
1402    return pos;
1403  }
1404
1405  /** Return the size of the remaining available bytes
1406   * if the size is less than or equal to {@link Integer#MAX_VALUE},
1407   * otherwise, return {@link Integer#MAX_VALUE}.
1408   */
1409  @Override
1410  public synchronized int available() throws IOException {
1411    if (closed) {
1412      throw new IOException("Stream closed");
1413    }
1414
1415    final long remaining = getFileLength() - pos;
1416    return remaining <= Integer.MAX_VALUE? (int)remaining: Integer.MAX_VALUE;
1417  }
1418
1419  /**
1420   * We definitely don't support marks
1421   */
1422  @Override
1423  public boolean markSupported() {
1424    return false;
1425  }
1426  @Override
1427  public void mark(int readLimit) {
1428  }
1429  @Override
1430  public void reset() throws IOException {
1431    throw new IOException("Mark/reset not supported");
1432  }
1433
1434  /**
1435   * Pick the best node from which to stream the data.
   * Entries in <i>nodes</i> are already in priority order.
1437   */
1438  static DatanodeInfo bestNode(DatanodeInfo nodes[], 
1439                               AbstractMap<DatanodeInfo, DatanodeInfo> deadNodes)
1440                               throws IOException {
1441    if (nodes != null) { 
1442      for (int i = 0; i < nodes.length; i++) {
1443        if (!deadNodes.containsKey(nodes[i])) {
1444          return nodes[i];
1445        }
1446      }
1447    }
1448    throw new IOException("No live nodes contain current block");
1449  }
1450
1451  /** Utility class to encapsulate data node info and its address. */
1452  static class DNAddrPair {
1453    DatanodeInfo info;
1454    InetSocketAddress addr;
1455    DNAddrPair(DatanodeInfo info, InetSocketAddress addr) {
1456      this.info = info;
1457      this.addr = addr;
1458    }
1459  }
1460
1461  /**
1462   * Get statistics about the reads which this DFSInputStream has done.
1463   */
1464  public synchronized ReadStatistics getReadStatistics() {
1465    return new ReadStatistics(readStatistics);
1466  }
1467
1468  private synchronized void closeCurrentBlockReader() {
1469    if (blockReader == null) return;
1470    // Close the current block reader so that the new caching settings can 
1471    // take effect immediately.
1472    try {
1473      blockReader.close();
1474    } catch (IOException e) {
1475      DFSClient.LOG.error("error closing blockReader", e);
1476    }
1477    blockReader = null;
1478  }
1479
1480  @Override
1481  public synchronized void setReadahead(Long readahead)
1482      throws IOException {
1483    this.cachingStrategy =
1484        new CachingStrategy.Builder(this.cachingStrategy).
1485            setReadahead(readahead).build();
1486    closeCurrentBlockReader();
1487  }
1488
1489  @Override
1490  public synchronized void setDropBehind(Boolean dropBehind)
1491      throws IOException {
1492    this.cachingStrategy =
1493        new CachingStrategy.Builder(this.cachingStrategy).
1494            setDropBehind(dropBehind).build();
1495    closeCurrentBlockReader();
1496  }
1497
1498  @Override
1499  public synchronized ByteBuffer read(ByteBufferPool bufferPool,
1500      int maxLength, EnumSet<ReadOption> opts) 
1501          throws IOException, UnsupportedOperationException {
1502    assert(maxLength > 0);
1503    if (((blockReader == null) || (blockEnd == -1)) &&
1504          (pos < getFileLength())) {
1505      /*
1506       * If we don't have a blockReader, or the one we have has no more bytes
1507       * left to read, we call seekToBlockSource to get a new blockReader and
1508       * recalculate blockEnd.  Note that we assume we're not at EOF here
1509       * (we check this above).
1510       */
1511      if ((!seekToBlockSource(pos)) || (blockReader == null)) {
1512        throw new IOException("failed to allocate new BlockReader " +
1513            "at position " + pos);
1514      }
1515    }
1516    ByteBuffer buffer = tryReadZeroCopy(maxLength, opts);
1517    if (buffer != null) {
1518      return buffer;
1519    }
1520    buffer = ByteBufferUtil.fallbackRead(this, bufferPool, maxLength);
1521    if (buffer != null) {
1522      extendedReadBuffers.put(buffer, bufferPool);
1523    }
1524    return buffer;
1525  }
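
  /*
   * Minimal usage sketch for the enhanced byte-buffer read above
   * (illustrative only; "in" and "pool" are assumed, and ElasticByteBufferPool
   * is one available ByteBufferPool implementation). Buffers handed out here
   * are tracked in extendedReadBuffers and must be given back through
   * releaseBuffer(), whether they came from a zero-copy mmap or from the
   * fallback pool:
   *
   *   ByteBufferPool pool = new ElasticByteBufferPool();
   *   ByteBuffer bb = in.read(pool, 1024 * 1024, EnumSet.noneOf(ReadOption.class));
   *   try {
   *     // ... consume bb (null means EOF) ...
   *   } finally {
   *     if (bb != null) {
   *       in.releaseBuffer(bb);
   *     }
   *   }
   */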
1526
1527  private synchronized ByteBuffer tryReadZeroCopy(int maxLength,
1528      EnumSet<ReadOption> opts) throws IOException {
1529    // Java ByteBuffers can't be longer than 2 GB, because they use
1530    // 4-byte signed integers to represent capacity, etc.
1531    // So we can't mmap the parts of the block higher than the 2 GB offset.
1532    // FIXME: we could work around this with multiple memory maps.
1533    // See HDFS-5101.
1534    long blockEnd32 = Math.min(Integer.MAX_VALUE, blockEnd);
1535    long curPos = pos;
1536    long blockLeft = blockEnd32 - curPos + 1;
1537    if (blockLeft <= 0) {
1538      if (DFSClient.LOG.isDebugEnabled()) {
1539        DFSClient.LOG.debug("unable to perform a zero-copy read from offset " +
1540          curPos + " of " + src + "; blockLeft = " + blockLeft +
1541          "; blockEnd32 = " + blockEnd32 + ", blockEnd = " + blockEnd +
1542          "; maxLength = " + maxLength);
1543      }
1544      return null;
1545    }
1546    int length = Math.min((int)blockLeft, maxLength);
1547    long blockStartInFile = currentLocatedBlock.getStartOffset();
1548    long blockPos = curPos - blockStartInFile;
1549    long limit = blockPos + length;
1550    ClientMmap clientMmap =
1551        blockReader.getClientMmap(opts, dfsClient.getMmapManager());
1552    if (clientMmap == null) {
1553      if (DFSClient.LOG.isDebugEnabled()) {
1554        DFSClient.LOG.debug("unable to perform a zero-copy read from offset " +
1555          curPos + " of " + src + "; BlockReader#getClientMmap returned " +
1556          "null.");
1557      }
1558      return null;
1559    }
1560    seek(pos + length);
1561    ByteBuffer buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer();
1562    buffer.position((int)blockPos);
1563    buffer.limit((int)limit);
1564    clientMmap.ref();
1565    extendedReadBuffers.put(buffer, clientMmap);
1566    readStatistics.addZeroCopyBytes(length);
1567    if (DFSClient.LOG.isDebugEnabled()) {
      DFSClient.LOG.debug("readZeroCopy read " + length + " bytes from " +
1569          "offset " + curPos + " via the zero-copy read path.  " +
1570          "blockEnd = " + blockEnd);
1571    }
1572    return buffer;
1573  }
1574
1575  @Override
1576  public synchronized void releaseBuffer(ByteBuffer buffer) {
1577    Object val = extendedReadBuffers.remove(buffer);
1578    if (val == null) {
1579      throw new IllegalArgumentException("tried to release a buffer " +
1580          "that was not created by this stream, " + buffer);
1581    }
1582    if (val instanceof ClientMmap) {
1583      ((ClientMmap)val).unref();
1584    } else if (val instanceof ByteBufferPool) {
1585      ((ByteBufferPool)val).putBuffer(buffer);
1586    }
1587  }
1588}