NameNodeRpcServer.create方法:NameNode收到的所有RPC请求都由NameNodeRpcServer来处理
检查namenode的状态
checkNNStartup(); private void checkNNStartup() throws IOException { if (!this.nn.isStarted()) { String message = NameNode.composeNotStartedMessage(this.nn.getRole()); throw new RetriableException(message); } }获取client的IP地址
String clientMachine = getClientMachine(); private static String getClientMachine() { String clientMachine = Server.getRemoteAddress(); if (clientMachine == null) { //not a RPC client clientMachine = ""; } return clientMachine; }校验文件srcPath的length及depth srcPath最大length为:MAX_PATH_LENGTH = 8000 srcPath最大depth: MAX_PATH_DEPTH = 1000
if (!checkPathLength(src)) { throw new IOException("create: Pathname too long. Limit " + MAX_PATH_LENGTH + " characters, " + MAX_PATH_DEPTH + " levels."); } private boolean checkPathLength(String src) { Path srcPath = new Path(src); return (src.length() <= MAX_PATH_LENGTH && srcPath.depth() <= MAX_PATH_DEPTH); }验证当前状态下是否允许给定的操作类别.主要针对开启HA的集群
namesystem.checkOperation(OperationCategory.WRITE); public void checkOperation(OperationCategory op) throws StandbyException { if (haContext != null) { // null in some unit tests haContext.checkOperation(op); } }以上代码的时序图:
首先查看缓存中是否已经有了相同rpc调用的结果,如果缓存中有直接返回结果。
CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache, null); if (cacheEntry != null && cacheEntry.isSuccess()) { return (HdfsFileStatus) cacheEntry.getPayload(); }缓存中没有,则调用FSNamesystem的startFile方法在namespace中创建文件,创建成功后将返回值HdfsFileStatus对象返回给客户端。FSNamesystem.startFile方法中直接调用了FSNamesystem.startFileInt方法
status = namesystem.startFile(src, perm, clientName, clientMachine, flag.get(), createParent, replication, blockSize, supportedVersions, ecPolicyName, cacheEntry != null); metrics.incrFilesCreated(); metrics.incrCreateFileOps(); return status;HdfsFileStatus创建过程:
FSNamesystem.startFileInt方法
首先判断src是否有效
if (!DFSUtil.isValidName(src) || FSDirectory.isExactReservedName(src) || (FSDirectory.isReservedName(src) && !FSDirectory.isReservedRawName(src) && !FSDirectory.isReservedInodesName(src))) { throw new InvalidPathException(src); } public static boolean isValidName(String src) { // Path must be absolute.必须是绝对路径,src必须以‘/’开头。 if (!src.startsWith(Path.SEPARATOR)) { return false; } // Check for ".." "." ":" "/" String[] components = StringUtils.split(src, '/'); for (int i = 0; i < components.length; i++) { String element = components[i]; if (element.equals(".") || (element.contains(":")) || (element.contains("/"))) { return false; } // ".." is allowed in path starting with /.reserved/.inodes if (element.equals("..")) { if (components.length > 4 && components[1].equals(".reserved") && components[2].equals(".inodes")) { continue; } return false; } // The string may start or end with a /, but not have "//" in the middle. // 路径中间不能有‘//’ if (element.isEmpty() && i != components.length - 1 && i != 0) { return false; } } return true; } /* * CHECK_RESERVED_FILE_NAMES:true * DOT_RESERVED_PATH_PREFIX:"/.reserved" */ //src是不是"/.reserved" public static boolean isExactReservedName(String src) { return CHECK_RESERVED_FILE_NAMES && src.equals(DOT_RESERVED_PATH_PREFIX); } //src是不是以"/.reserved/"开头 public static boolean isReservedName(String src) { return src.startsWith(DOT_RESERVED_PATH_PREFIX + Path.SEPARATOR); } //src是不是以"/.reserved/raw"开头 static boolean isReservedRawName(String src) { return src.startsWith(DOT_RESERVED_PATH_PREFIX + Path.SEPARATOR + RAW_STRING); } //src是不是以"/.reserved/.inodes"开头 static boolean isReservedInodesName(String src) { return src.startsWith(DOT_RESERVED_PATH_PREFIX + Path.SEPARATOR + DOT_INODES_STRING); }以上代码的时序图:
然后检查冗余方式:纠删码 or 副本
boolean shouldReplicate = flag.contains(CreateFlag.SHOULD_REPLICATE); if (shouldReplicate && (!org.apache.commons.lang.StringUtils.isEmpty(ecPolicyName))) { throw new HadoopIllegalArgumentException("SHOULD_REPLICATE flag and " + "ecPolicyName are exclusive parameters. Set both is not allowed!"); }验证当前状态是否允许写操作,并做权限检查
checkOperation(OperationCategory.WRITE); final FSPermissionChecker pc = getPermissionChecker(); //可重入的读写锁 //基于java.util.concurrent.locks.ReentrantReadWriteLock //写锁 writeLock();检查namenode是否处于safemode
checkNameNodeSafeMode("Cannot create file" + src); void checkNameNodeSafeMode(String errorMsg) throws RetriableException, SafeModeException { if (isInSafeMode()) { SafeModeException se = newSafemodeException(errorMsg); if (haEnabled && haContext != null && haContext.getState().getServiceState() == HAServiceState.ACTIVE && isInStartupSafeMode()) { throw new RetriableException(se); } else { throw se; } } }时序图如下:
构造INodesInPath 对象
iip = FSDirWriteFileOp.resolvePathForStartFile(dir, pc, src, flag, createParent); 参数: dir:[FSDirectory] The namespace tree pc:[FSPermissionChecker] dir.getPermissionChecker() src:[String] 客户端传来的参数 flag:[EnumSetWritable] CREATE((short) 0x01),OVERWRITE((short) 0x02),APPEND((short) 0x04)... createParent:[boolean]检查replication是否在正确范围内
if (shouldReplicate || (org.apache.commons.lang.StringUtils.isEmpty(ecPolicyName) && !FSDirErasureCodingOp.hasErasureCodingPolicy(this, iip))) { blockManager.verifyReplication(src, replication, clientMachine); } public void verifyReplication(String src, short replication, String clientName) throws IOException { String err = null; if (replication > maxReplication) { err = " exceeds maximum of " + maxReplication; } else if (replication < minReplication) { err = " is less than the required minimum of " + minReplication; } if (err != null) { throw new IOException("Requested replication factor of " + replication + err + " for " + src + (clientName == null? "": ", clientName=" + clientName)); } }检查blockSize
if (blockSize < minBlockSize) { throw new IOException("Specified block size is less than configured" + " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY + "): " + blockSize + " < " + minBlockSize); }获取文件加密信息(本次不做细究)
// provider = DFSUtil.createKeyProviderCryptoExtension(conf); if (!iip.isRaw() && provider != null) { EncryptionKeyInfo ezInfo = FSDirEncryptionZoneOp.getEncryptionKeyInfo( this, iip, supportedVersions); // if the path has an encryption zone, the lock was released while // generating the EDEK. re-resolve the path to ensure the namesystem // and/or EZ has not mutated if (ezInfo != null) { checkOperation(OperationCategory.WRITE); iip = FSDirWriteFileOp.resolvePathForStartFile( dir, pc, iip.getPath(), flag, createParent); feInfo = FSDirEncryptionZoneOp.getFileEncryptionInfo( dir, iip, ezInfo); } }调用FSDirWriteFileOp.startFile方法在namespace中创建文件
dir.writeLock(); ... stat = FSDirWriteFileOp.startFile(this, iip, permissions, holder, clientMachine, flag, createParent, replication, blockSize, feInfo, toRemoveBlocks, shouldReplicate, ecPolicyName, logRetryCache); ... dir.writeUnlock(); ... return stat;FSDirWriteFileOp.startFile方法:Create a new file or overwrite an existing file
检查参数
//是否覆盖 boolean overwrite = flag.contains(CreateFlag.OVERWRITE); //是否使用内存存储策略 boolean isLazyPersist = flag.contains(CreateFlag.LAZY_PERSIST);获取文件路径及namespace中的目录树结构
final String src = iip.getPath(); FSDirectory fsd = fsn.getFSDirectory();如果文件已经存在
if (iip.getLastINode() != null) { //如果是覆盖写,则先删除 if (overwrite) { List<INode> toRemoveINodes = new ChunkedArrayList<>(); List<Long> toRemoveUCFiles = new ChunkedArrayList<>(); long ret = FSDirDeleteOp.delete(fsd, iip, toRemoveBlocks, toRemoveINodes, toRemoveUCFiles, now()); if (ret >= 0) { iip = INodesInPath.replace(iip, iip.length() - 1, null); FSDirDeleteOp.incrDeletedFileCount(ret); fsn.removeLeasesAndINodes(toRemoveUCFiles, toRemoveINodes, true); } } //如果不是覆盖写,则抛出文件已存在的异常信息 else { // If lease soft limit time is expired, recover the lease fsn.recoverLeaseInternal(FSNamesystem.RecoverLeaseOp.CREATE_FILE, iip, src, holder, clientMachine, false); throw new FileAlreadyExistsException(src + " for client " + clientMachine + " already exists"); } } static long delete(FSDirectory fsd, INodesInPath iip, BlocksMapUpdateInfo collectedBlocks, List<INode> removedINodes, List<Long> removedUCFiles, long mtime) throws IOException { if (NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug("DIR* FSDirectory.delete: " + iip.getPath()); } long filesRemoved = -1; FSNamesystem fsn = fsd.getFSNamesystem(); fsd.writeLock(); try { //deleteAllowed:iip是null或者iip是根目录的话返回false,否则返回true if (deleteAllowed(iip)) { List<INodeDirectory> snapshottableDirs = new ArrayList<>(); //todo FSDirSnapshotOp.checkSnapshot(fsd, iip, snapshottableDirs); ReclaimContext context = new ReclaimContext( fsd.getBlockStoragePolicySuite(), collectedBlocks, removedINodes, removedUCFiles); if (unprotectedDelete(fsd, iip, context, mtime)) { filesRemoved = context.quotaDelta().getNsDelta(); } //todo fsd.updateReplicationFactor(context.collectedBlocks() .toUpdateReplicationInfo()); //todo fsn.removeSnapshottableDirs(snapshottableDirs); //todo fsd.updateCount(iip, context.quotaDelta(), false); } } finally { fsd.writeUnlock(); } return filesRemoved; }检查文件数是否超限
fsn.checkFsObjectLimit(); void checkFsObjectLimit() throws IOException { if (maxFsObjects != 0 && maxFsObjects <= dir.totalInodes() + getBlocksTotal()) { throw new IOException("Exceeded the configured number of objects " + maxFsObjects + " in the filesystem."); } }创建所有的父级目录
INodesInPath parent = FSDirMkdirOp.createAncestorDirectories(fsd, iip, permissions);目录创建流程:createAncestorDirectories函数中判断iip中是否存在未创建的目录,如果存在则调用createSingleDirectory遍历创建不存在的目录,createSingleDirectory方法调用unprotectedMkdir方法进行目录创建
private static INodesInPath unprotectedMkdir(FSDirectory fsd, long inodeId, INodesInPath parent, byte[] name, PermissionStatus permission, List<AclEntry> aclEntries, long timestamp) throws QuotaExceededException, AclException, FileAlreadyExistsException { assert fsd.hasWriteLock(); //确保父级节点存在 assert parent.getLastINode() != null; //确保父级节点是目录节点 if (!parent.getLastINode().isDirectory()) { throw new FileAlreadyExistsException("Parent path is not a directory: " + parent.getPath() + " " + DFSUtil.bytes2String(name)); } //创建一个目录 final INodeDirectory dir = new INodeDirectory(inodeId, name, permission, timestamp); //将创建的目录添加到namespace的目录树中 INodesInPath iip = fsd.addLastINode(parent, dir, permission.getPermission(), true); if (iip != null && aclEntries != null) { AclStorage.updateINodeAcl(dir, aclEntries, Snapshot.CURRENT_STATE_ID); } return iip; } }在namespace中创建文件
if (parent != null) { iip = addFile(fsd, parent, iip.getLastLocalName(), permissions, replication, blockSize, holder, clientMachine, shouldReplicate, ecPolicyName); newNode = iip != null ? iip.getLastINode().asFile() : null; } private static INodesInPath addFile( FSDirectory fsd, INodesInPath existing, byte[] localName, PermissionStatus permissions, short replication, long preferredBlockSize, String clientName, String clientMachine, boolean shouldReplicate, String ecPolicyName) throws IOException { //检查父级目录是否为空 Preconditions.checkNotNull(existing); long modTime = now(); INodesInPath newiip; fsd.writeLock(); try { ... INodeFile newNode = newINodeFile(fsd.allocateNewInodeId(), permissions, modTime, modTime, replicationFactor, ecPolicyID, preferredBlockSize, blockType); newNode.setLocalName(localName); newNode.toUnderConstruction(clientName, clientMachine); //向namespace树中添加文件 newiip = fsd.addINode(existing, newNode, permissions.getPermission()); } ... return newiip; } public final void addToInodeMap(INode inode) { if (inode instanceof INodeWithAdditionalFields) { //最终是将数据放到了INodeMap中 inodeMap.put(inode); if (!inode.isSymlink()) { final XAttrFeature xaf = inode.getXAttrFeature(); addEncryptionZone((INodeWithAdditionalFields) inode, xaf); } } }设置存储策略
setNewINodeStoragePolicy(fsd.getBlockManager(), iip, isLazyPersist);记录日志
fsd.getEditLog().logOpenFile(src, newNode, overwrite, logRetryEntry); //logOpenFile方法调用logEdit方法 void logEdit(final FSEditLogOp op) { boolean needsSync = false; synchronized (this) { assert isOpenForWrite() : "bad state: " + state; // wait if an automatic sync is scheduled waitIfAutoSyncScheduled(); // check if it is time to schedule an automatic sync needsSync = doEditTransaction(op); if (needsSync) { isAutoSyncScheduled = true; } } // Sync the log if an automatic sync is required. if (needsSync) { logSync(); } }返回结果
return FSDirStatAndListingOp.getFileInfo(fsd, iip, false, false);INodesInPath创建过程:
FSDirWriteFileOp.resolvePathForStartFile方法:
调用FSDirectory.resolvePath方法创建INodesInPath对象
INodesInPath iip = dir.resolvePath(pc, src, DirOp.CREATE); //下面是一些校验,校验不过抛出相应的异常 ...FSDirectory.resolvePath
将src拆分成字节数组
... byte[][] components = INode.getPathComponents(src); ... //调用INodesInPath.resolve方法创建INodesInPath对象 INodesInPath iip = INodesInPath.resolve(rootDir, components, isRaw); ...INodesInPath.resolve方法:
遍历components,创建INodesInPath
//根目录 INode curNode = startingDir; int count = 0; int inodeNum = 0; INode[] inodes = new INode[components.length]; boolean isSnapshot = false; int snapshotId = CURRENT_STATE_ID; while (count < components.length && curNode != null) { // ... else { // normal case, and also for resolving file/dir under snapshot root //采用binarySearch方法查找childName在namespace中的位置,如果namespace中不存在返回null curNode = dir.getChild(childName, isSnapshot ? snapshotId : CURRENT_STATE_ID); } } return new INodesInPath(inodes, components, isRaw, isSnapshot, snapshotId);