Source code for mechanoChemML.workflows.active_learning.slurm_manager

#!/usr/bin/env python

import os
from subprocess import STDOUT, CalledProcessError, check_output
from time import sleep

import numpy as np

def numCurrentJobs(name):
    """Return how many jobs called ``name`` are listed by ``squeue``.

    Runs ``squeue -n <name>`` and counts the lines of its output, minus two
    (one header row, plus the empty string that follows the final newline
    when the text is split on ``'\\n'``).

    Args:
        name: Job name passed to ``squeue -n``.

    Returns:
        int: The number of listed jobs (never negative). If ``squeue``
        cannot be started or exits with an error, the failure is printed
        and ``1`` is returned, so a caller polling for "no jobs left"
        keeps waiting instead of concluding that everything finished.
    """
    try:
        listing = check_output(['squeue', '-n', name], stderr=STDOUT).decode("utf-8")
    except (CalledProcessError, OSError) as err:
        # Report the failure that actually happened. Re-running squeue here
        # (as the handler used to) would just raise again from inside the
        # handler and the fallback value would never be returned.
        captured = getattr(err, 'output', None)
        detail = captured.decode("utf-8", errors="replace") if captured else str(err)
        print('Error with numCurrentJobs: ', detail)
        return 1

    # Header row + trailing empty string account for the "- 2"; clamp so an
    # unexpectedly empty listing is reported as zero jobs, not a negative count.
    return max(len(listing.split('\n')) - 2, 0)
def checkPending(name):
    """Report whether ``squeue`` lists a job called ``name`` with state ``PD``.

    Runs ``squeue -n <name>`` and looks for a row that contains ``PD`` as a
    whole whitespace-separated field (``PD`` is Slurm's compact code for a
    pending job).

    Args:
        name: Job name passed to ``squeue -n``.

    Returns:
        bool: ``True`` if at least one listed row has a ``PD`` field,
        otherwise ``False``. If ``squeue`` cannot be started or exits with
        an error, the failure is printed and ``False`` is returned.
    """
    try:
        listing = check_output(['squeue', '-n', name], stderr=STDOUT).decode("utf-8")
    except (CalledProcessError, OSError) as err:
        # Print the captured failure rather than invoking squeue a second
        # time, which would raise again from inside the handler.
        captured = getattr(err, 'output', None)
        detail = captured.decode("utf-8", errors="replace") if captured else str(err)
        print('Error with check pending: ', detail)
        return False

    # Compare whole fields, not substrings: a job/user/partition name that
    # merely contains the letters "PD" must not be mistaken for the state.
    return any('PD' in row.split() for row in listing.split('\n'))
def submitJob(command, specs=None, is_dnsml=False):
    """Write a Slurm batch script to ``submit.slrm`` and submit it with ``sbatch``.

    The script is written to the current working directory, overwriting any
    existing ``submit.slrm``.

    Args:
        command: Body of the job script. A list is written one item per
            line; any other value is written as-is with no trailing newline.
        specs: Optional dict of overrides for the built-in defaults
            (``wall_time``, ``nodes``, ``ntasks-per-node``, ``total_memory``,
            ``job_name``, ``output_folder``, ``queue``). Two further keys are
            honoured only when present: ``account`` (``#SBATCH -A``) and
            ``array`` (``#SBATCH --array``).
        is_dnsml: Not used by this function; kept so existing callers that
            pass it continue to work.

    Returns:
        The value returned by ``os.system('sbatch submit.slrm')``.
    """
    # Default values for the Slurm job script
    settings = {'wall_time': '12:00:00',
                'nodes': 1,
                'ntasks-per-node': 1,
                'total_memory': '1G',
                'job_name': 'default',
                'output_folder': '.',
                'queue': 'shared'}

    # Incorporate any changes to the defaults
    if specs:
        settings.update(specs)

    # Assemble the script text
    script = ['#!/bin/bash\n#\n',
              "#SBATCH -t {} # wall time\n".format(settings['wall_time'])]
    if 'account' in settings:
        script.append("#SBATCH -A {} # account\n".format(settings['account']))
    script.append("#SBATCH --nodes {} \n".format(settings['nodes']))
    script.append("#SBATCH --ntasks-per-node={} \n".format(settings['ntasks-per-node']))
    script.append("#SBATCH --mem={}\n".format(settings['total_memory']))
    script.append('#SBATCH -J {} # job name\n'.format(settings["job_name"]))
    if 'gpu' in settings['queue']:
        # The newline matters: without it the following #SBATCH directive is
        # glued onto this line and is no longer a directive of its own.
        script.append('#SBATCH --gpus=1\n')
    if 'array' in settings:
        script.append("#SBATCH --array={} # job array\n".format(settings['array']))
    script.append('#SBATCH -e {}/errors.%J # error file name in which %J is replaced by the job ID \n'.format(settings["output_folder"]))
    script.append('#SBATCH -o {}/output.%J # output file name in which %J is replaced by the job ID\n'.format(settings["output_folder"]))
    script.append('#SBATCH -p {} # choose the queue (partition) to use\n\n'.format(settings["queue"]))
    script.append('#SBATCH --export=ALL\n\n')

    if isinstance(command, list):
        for item in command:
            script.append(item)
            script.append('\n')
    else:
        script.append(command)

    # Write out job script
    with open('submit.slrm', 'w') as fout:
        fout.writelines(script)

    # Submit script (only after the file has been closed and flushed)
    return os.system('sbatch submit.slrm')
def waitForAll(name, interval=15):
    """Block until no job called ``name`` is reported as running or pending.

    Sleeps for ``2 * interval`` seconds once up front, then polls
    ``numCurrentJobs(name)`` and, only when that is not positive,
    ``checkPending(name)``; it sleeps ``interval`` seconds between polls.

    Args:
        name: Job name to poll for.
        interval: Seconds between polls (the initial delay is twice this).
    """
    # Initial delay before the first poll (presumably to give freshly
    # submitted jobs time to show up in the queue - confirm with callers).
    sleep(2 * interval)

    while True:
        jobs_remaining = numCurrentJobs(name) > 0 or checkPending(name)
        if not jobs_remaining:
            break
        sleep(interval)