More often than not, one must cancel the execution of a parameter study on a SLURM cluster, which means killing a lot of jobs. This Python script does it automatically for you.
#!/usr/bin/python
import sys
import os
import subprocess
def _jobIDFromFileName(fileName):
    """Return the job ID embedded in a 'slurm-<id>.out' file name, else None.

    Accepts plain IDs ('slurm-123.out') and array-task IDs
    ('slurm-123_4.out'). Slicing is used instead of lstrip/rstrip because
    those strip *character sets*, not a literal prefix/suffix.
    """
    prefix, suffix = "slurm-", ".out"
    if not (fileName.startswith(prefix) and fileName.endswith(suffix)):
        return None
    jobID = fileName[len(prefix):-len(suffix)]
    base, sep, task = jobID.partition("_")
    if not base.isdigit() or (sep and not task.isdigit()):
        return None
    return jobID


def _jobSortKey(jobID):
    """Return a numeric sort key so that job 10 ranks after job 9."""
    base, _, task = jobID.partition("_")
    return (int(base), int(task) if task else -1)


def main(argv=None, run=subprocess.call):
    """Cancel the latest SLURM job of every directory matching a pattern.

    Every sub-directory of the current directory whose name contains the
    pattern is scanned for 'slurm-<id>.out' files; the highest job ID found
    there is passed to 'scancel'. Directories without such a file and
    non-directory entries are skipped.

    Args:
        argv: Argument vector whose second element is the directory name
            pattern. Defaults to sys.argv.
        run: Callable invoked with the command as a list of strings.
            Defaults to subprocess.call; pass another callable for a dry run.

    Raises:
        SystemExit: If no pattern was given.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        sys.exit("Usage: %s CASE_PATTERN" % (args[0] if args else "script"))
    casePattern = args[1]

    # os.listdir returns entries in arbitrary order; sort for a stable run.
    for dirName in sorted(os.listdir(os.curdir)):
        dirPath = os.path.join(os.curdir, dirName)
        if casePattern not in dirName or not os.path.isdir(dirPath):
            continue

        jobIDs = [_jobIDFromFileName(name) for name in os.listdir(dirPath)]
        jobIDs = [jobID for jobID in jobIDs if jobID is not None]
        if not jobIDs:
            # Nothing was submitted from this directory (or output was
            # redirected elsewhere); there is no job ID to cancel.
            continue

        # Compare IDs numerically: lexical order would put 9 after 10.
        lastSlurmID = max(jobIDs, key=_jobSortKey)
        print("Canceling %s " % lastSlurmID)
        run(["scancel", lastSlurmID])
# Run the cancellation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Call the script with a pattern (a substring of the directory names used for the parameter study); in each matching directory it cancels the SLURM job whose ID is taken from the latest slurm*.out file.