[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[HTCondor-users] setup to prioritize a single-job vs a whole-machine-job



Hi Condor experts,


I've been reading the HTCondor documentation, but I still don't understand how this HTCondor mechanism works.

I'm trying to set up a node so that a running whole-machine job is suspended when a single job wants to run on that node.

I'd be grateful if an expert could help me set this up.


This is my current condor_config.local, which supports whole-machine jobs:

===================================================
     1	ALLOW_WRITE = $(HOSTALLOW_WRITE)
     2	#AMAZON_GAHP = $(SBIN)/amazon_gahp
     3	#AMAZON_GAHP_LOG = /tmp/AmazonGahpLog.$(USERNAME)
     4	COLLECTOR_NAME = Collector at ms01.cm.domain.example.com
     5	COLLECTOR_SOCKET_CACHE_SIZE = 1000
     6	CONDOR_ADMIN = condor@xxxxxxxxxxxxxxxxxxxxxxxxxx
     7	CONDOR_DEVELOPERS = NONE
     8	CONDOR_DEVELOPERS_COLLECTOR = NONE
     9	CONDOR_HOST = ms01.cm.domain.example.com
    10	CONDOR_IDS = 993.989
    11	CONDOR_SSHD = /usr/sbin/sshd
    12	CONDOR_SSH_KEYGEN = /usr/bin/ssh-keygen
    13	CONTINUE = True
    14	DAEMON_LIST = MASTER, STARTD
    15	EMAIL_DOMAIN = $(FULL_HOSTNAME)
    16	FILESYSTEM_DOMAIN = cm.domain.example.com
    17	HIGHPORT = 50000
    18	HOSTALLOW_WRITE = *.domain.example.com
    19	JAVA =
    20	KILL = False
    21	#LOCAL_DIR = /var/opt/condor
    22	#LOCK = /tmp/condor-lock.$(HOSTNAME)
    23	LOWPORT = 40000
    24	MAIL = /bin/mail
    25	NEGOTIATOR_INTERVAL = 120
    26	NETWORK_INTERFACE = ib0
    27	PREEMPT = False
    28	RANK = None
    29	#RELEASE_DIR = /opt/condor
    30	#SOAP_SSL_CA_FILE = /etc/pki/tls/cert.pem
    31	TRUST_UID_DOMAIN = True
    32	UID_DOMAIN = cm.domain.example.com
    33	START = True
    34	STARTD_EXPRS = $(STARTD_EXPRS)
    35	SUSPEND = False
    36	UPDATE_COLLECTOR_WITH_TCP = True
    37	WANT_SUSPEND = False
    38	WANT_VACATE = False
    39	
    40	#Modificacoes de 15/dez/2018
    41	# configuracao para que WholeMachine funcione no cluster corretamente
    42	START = ($(START)) && (TARGET.RequiresWholeMachine =!= TRUE || SlotID == 1)
    43	#
    44	# # have the machine advertise when it is running a whole-machine job
    45	STARTD_JOB_EXPRS = $(STARTD_JOB_EXPRS) RequiresWholeMachine
    46	
    47	# # Export the job expr to all other slots
    48	STARTD_SLOT_EXPRS = RequiresWholeMachine
    49	
    50	# # require that no single-cpu jobs may start when a whole-machine job is running
    51	START = ($(START)) && (SlotID == 1 || Slot1_RequiresWholeMachine =!= True)
    52	
    53	# Já existe na linha superior
    54	# # avoid suspending jobs
    55	#SUSPEND = FALSE
    56	
    57	# @cas em 13/dez/2018
    58	UNUSED_CLAIM_TIMEOUT = 600 
    59	
    60	#Ativar a configuração do docker
    61	DOCKER = /usr/bin/docker

======================================================================================================

And this is what I'm trying to set up in order to suspend a running whole-machine job when a single job knocks on the door of that node:

Lines 1-34 are the same as in the configuration above.

======================================================================================================
    35	SUSPEND = True
    36	UPDATE_COLLECTOR_WITH_TCP = True
    37	WANT_SUSPEND = False
    38	WANT_VACATE = False
    39	
    40	#Modificacoes de 15/dez/2018
    41	# configuracao para que WholeMachine funcione no cluster corretamente
    42	START = ($(START)) && (TARGET.RequiresWholeMachine =!= TRUE || SlotID == 1)
    43	#
    44	# # have the machine advertise when it is running a whole-machine job
    45	STARTD_JOB_EXPRS = $(STARTD_JOB_EXPRS) RequiresWholeMachine
    46	
    47	# # Export the job expr to all other slots
    48	STARTD_SLOT_EXPRS = RequiresWholeMachine
    49	
    50	# # require that no single-cpu jobs may start when a whole-machine job is running
    51	#START = ($(START)) && (SlotID == 1 || Slot1_RequiresWholeMachine =!= True)
    52	
    53	# advertise the activity of each slot into the ads of the other slots,
    54	# so the SUSPEND expression can see it
    55	STARTD_SLOT_EXPRS = $(STARTD_SLOT_EXPRS) Activity
    56	
    57	# Suspend the whole-machine job until the other slots are empty
    58	SUSPEND = ($(SUSPEND)) || (SlotID == 1 && Slot1_RequiresWholeMachine =?= True && ( \
    59		Slot2_Activity =?= "Busy" || \
    60		Slot3_Activity =?= "Busy" || \
    61		Slot4_Activity =?= "Busy" || \
    62		Slot5_Activity =?= "Busy" || \
    63		Slot6_Activity =?= "Busy" || \
    64		Slot7_Activity =?= "Busy" || \
    65		Slot8_Activity =?= "Busy" || \
    66		Slot9_Activity =?= "Busy" || \
    67		Slot10_Activity =?= "Busy" || \
    68		Slot11_Activity =?= "Busy" || \
    69		Slot12_Activity =?= "Busy" ) )
    70	
    71	# 
    72	#NEGOTIATOR_PRE_JOB_RANK = -TARGET.LoadAvg*(MY.RequiresWholeMachine =?= True)
    73	
    74	# Já existe na linha superior
    75	# # avoid suspending jobs
    76	#SUSPEND = FALSE
    77	
    78	# @cas em 13/dez/2018
    79	UNUSED_CLAIM_TIMEOUT = 600 
    80	
    81	#Ativar a configuração do docker
    82	DOCKER = /usr/bin/docker



cheers,

--
Carlos Adean
skype: carlosadean