parser = argparse.ArgumentParser(description='Panoptic Deeplab')
parser.add_argument('--training_dataset', default='/home/ma-user/work/panoptic-deeplab/', help='Training dataset directory') # 在ModelArts中创建算法时,必须进行输入路径映射配置,输入映射路径的前缀必须是/home/work/modelarts/inputs/,作用是在启动训练时,将OBS的数据拷贝到这个本地路径中供本地代码使用。
parser.add_argument('--train_url', default='./output', help='the path to save training outputs') # 在ModelArts中创建训练作业时,必须指定OBS上的一个训练输出位置,训练结束时,会将输出映射路径拷贝到该位置
parser.add_argument('--num_gpus', default=1, type=int, help='num of GPUs to train')
parser.add_argument('--eval', default='False', help='whether to eval')
parser.add_argument('--load_weight', default='trained_model/model/model_final.pth',type=str) # obs路径 断点模型 pth文件 如果是评估 则是相对于src的路径
parser.add_argument('--iteration', default=100, type=int)
parser.add_argument('--learning_rate', default=0.001, type=float)
parser.add_argument('--ims_per_batch', default=8, type=int)
args, unknown = parser.parse_known_args() # 必须将parse_args改成parse_known_args,因为在ModelArts训练作业中运行时平台会传入一个额外的init_method的参数
# dir
fname = os.getcwd()
project_dir = os.path.join(fname, "panoptic-deeplab")
detectron2_dir = os.path.join(fname, "detectron2-0.3+cu102-cp36-cp36m-linux_x86_64.whl")
panopticapi_dir = os.path.join(fname, "panopticapi-0.1-py3-none-any.whl")
cityscapesscripts_dir = os.path.join(fname, "cityscapesScripts-2.1.7-py3-none-any.whl")
requirements_dir = os.path.join(project_dir, "requirements.txt")
output_dir = "/home/work/modelarts/outputs/train_output"
# config strings
evalpath = ''
MAX_ITER = 'SOLVER.MAX_ITER ' + str(args.iteration+90000)
BASE_LR = 'SOLVER.BASE_LR ' + str(args.learning_rate)
IMS_PER_BATCH = 'SOLVER.IMS_PER_BATCH ' + str(args.ims_per_batch)
SCRIPT_PATH = os.path.join(project_dir, "tools_d2/train_panoptic_deeplab.py")
CONFIG_PATH = os.path.join(fname, "configs/config.yaml")
CONFIG_CMD = '--config-file ' + CONFIG_PATH
EVAL_CMD = ''
GPU_CMD = ''
OPTS_CMD = MAX_ITER + ' ' + BASE_LR + ' ' + IMS_PER_BATCH
RESUME_CMD = ''
#functions
def merge_cmd(scirpt_path, config_cmd, gpu_cmd, eval_cmd, resume_cmd, opts_cmd):
return "python " + scirpt_path + " "+ config_cmd + " " + gpu_cmd + " " + eval_cmd + " " + resume_cmd + " " + OPTS_CMD
if args.eval == 'True':
assert args.load_weight, 'load_weight empty when trying to evaluate' # 如果评估时为空,则报错
if args.load_weight != 'trained_model/model/model_final.pth':
#将model拷贝到本地,并获取模型路径
modelpath, modelname = os.path.split(args.load_weight)
mox.file.copy_parallel(args.load_weight, os.path.join(fname, modelname))
evalpath = os.path.join(fname,modelname)
else:
evalpath = os.path.join(fname,'trained_model/model/model_final.pth')
EVAL_CMD = '--eval-only MODEL.WEIGHTS ' + evalpath
else:
GPU_CMD = '--num-gpus ' + str(args.num_gpus)
if args.load_weight:
RESUME_CMD = '--resume'
if args.load_weight != 'trained_model/model/model_final.pth':
modelpath, modelname = os.path.split(args.load_weight)
mox.file.copy_parallel(args.load_weight, os.path.join('/cache',modelname))
with open('/cache/last_checkpoint','w') as f: #创建last_checkpoint文件
f.write(modelname)
f.close()
else:
os.system('cp ' + os.path.join(fname, 'trained_model/model/model_final.pth') + ' /cache/model_final.pth')
with open('/cache/last_checkpoint','w') as f: #创建last_checkpoint文件
f.write('model_final.pth')
f.close()
os.environ['DETECTRON2_DATASETS'] = args.training_dataset #添加数据库路径环境变量
cmd = merge_cmd(SCRIPT_PATH, CONFIG_CMD, GPU_CMD, EVAL_CMD, RESUME_CMD, OPTS_CMD)
# os.system('mkdir -p ' + args.train_url)
print('*********Train Information*********')
print('Run Command: ' + cmd)
print('Num of GPUs: ' + str(args.num_gpus))
print('Evaluation: ' + args.eval)
if args.load_weight:
print('Load Weight: ' + args.load_weight)
else:
print('Load Weight: None (train from scratch)')
print('Iteration: ' + str(args.iteration))
print('Learning Rate: ' + str(args.learning_rate))
print('Images Per Batch: ' + str(args.ims_per_batch))