Class: OodCore::Job::Adapters::HTCondor

Inherits:
OodCore::Job::Adapter show all
Defined in:
lib/ood_core/job/adapters/htcondor.rb

Overview

An adapter object that describes the communication with an HTCondor resource manager for job management.

Defined Under Namespace

Classes: Batch

Constant Summary collapse

STATUS_MAP =

Map HTCondor job statuses to symbols

# Lookup table from status code (as a String key) to the adapter's status symbol.
# The table is frozen so it cannot be modified at runtime.
# NOTE(review): per the HTCondor manual the JobStatus codes are presumably
# 1=Idle, 2=Running, 3=Removed, 4=Completed, 5=Held, 6=Transferring Output,
# 7=Suspended — confirm, and in particular confirm that mapping "3" to
# :running is intentional.
{
    "1" => :queued,
    "2" => :running,
    "3" => :running,
    "4" => :completed,
    "5" => :queued_held,
    "6" => :running,
    "7" => :suspended
}.freeze

Instance Method Summary collapse

Methods inherited from OodCore::Job::Adapter

#directive_prefix, #info_all_each, #info_historic, #info_where_owner, #info_where_owner_each, #job_name_illegal_chars, #nodes, #queues, #sanitize_job_name

Constructor Details

#initialize(opts = {}) ⇒ HTCondor

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Returns a new instance of HTCondor.

Parameters:

  • opts (#to_h) (defaults to: {})

    the options defining this adapter

Options Hash (opts):

  • :htcondor (Batch)

    The HTCondor batch object

See Also:



303
304
305
306
307
# File 'lib/ood_core/job/adapters/htcondor.rb', line 303

# Build the adapter around an HTCondor batch object.
#
# @api private
# @param opts [#to_h] the options defining this adapter
# @option opts [Batch] :htcondor The HTCondor batch object
# @raise [ArgumentError] when no :htcondor entry is supplied
def initialize(opts = {})
    options = opts.to_h.symbolize_keys

    # The batch object is mandatory; fail loudly instead of storing nil.
    @htcondor = options.fetch(:htcondor) do
        raise ArgumentError, "No HTCondor object specified. Missing argument: htcondor"
    end
end

Instance Method Details

#accounts ⇒ Array<AccountInfo>

Retrieve the relevant groups for the current user

Returns:

  • (Array<AccountInfo>)

    list of groups for the current user



505
506
507
508
509
510
511
# File 'lib/ood_core/job/adapters/htcondor.rb', line 505

# Retrieve the relevant groups for the current user
#
# The login name reported by Etc.getlogin is used as the lookup key into the
# table returned by the batch object's get_accounts.
#
# @raise [JobAdapterError] if the batch object raises Batch::Error
# @return [Array] the groups listed for the current user; an empty array
#   when the user has no entry, so callers can always iterate the result
def accounts
    username = Etc.getlogin
    # A user without any entry yields nil from the lookup; normalise to [].
    @htcondor.get_accounts[username] || []
rescue Batch::Error => e
    raise JobAdapterError, e.message
end

#cluster_info ⇒ Hash

Retrieve cluster status information

Returns:

  • (Hash)

    summary of cluster status including active and total nodes, processors, GPUs, etc.

Raises:



451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
# File 'lib/ood_core/job/adapters/htcondor.rb', line 451

# Retrieve cluster status information
#
# Aggregates the slot records returned by the batch object's get_slots into
# node, processor and GPU totals and wraps them in a ClusterInfo.
#
# @raise [JobAdapterError] if the batch object raises Batch::Error
# @return [ClusterInfo] summary built from active/total nodes, processors and GPUs
def cluster_info
    slot_list = @htcondor.get_slots

    usage = {
        # Slots that currently have at least one dynamic slot carved out.
        active_nodes: slot_list.select { |s| s[:num_dynamic_slots] > 0 }.size,
        # Distinct machine names across all slots.
        total_nodes: slot_list.map { |s| s[:machine] }.uniq.size,
        # Per slot, "in use" is the total minus what the slot still reports.
        active_processors: slot_list.map { |s| s[:total_cpus] - s[:cpus] }.sum,
        total_processors: slot_list.map { |s| s[:total_cpus] }.sum,
        active_gpus: slot_list.map { |s| s[:total_gpus] - s[:gpus] }.sum,
        total_gpus: slot_list.map { |s| s[:total_gpus] }.sum
    }

    ClusterInfo.new(usage)
rescue Batch::Error => err
    raise JobAdapterError, err.message
end

#delete(id) ⇒ Object

Delete a job

Parameters:

  • id (#to_s)

    the id of the job

Raises:



497
498
499
500
501
# File 'lib/ood_core/job/adapters/htcondor.rb', line 497

# Delete a job
#
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if the batch object raises Batch::Error
# @return [Object] whatever the batch object's remove_job returns
def delete(id)
    # Normalise the id to a String, as #info and #status do, so any object
    # responding to #to_s is accepted as documented.
    @htcondor.remove_job(id.to_s)
rescue Batch::Error => e
    raise JobAdapterError, e.message
end

#hold(id) ⇒ Object

Place a job on hold

Parameters:

  • id (#to_s)

    the id of the job

Raises:



480
481
482
483
484
# File 'lib/ood_core/job/adapters/htcondor.rb', line 480

# Place a job on hold
#
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if the batch object raises Batch::Error
# @return [Object] whatever the batch object's hold_job returns
def hold(id)
    # Normalise the id to a String, as #info and #status do, so any object
    # responding to #to_s is accepted as documented.
    @htcondor.hold_job(id.to_s)
rescue Batch::Error => e
    raise JobAdapterError, e.message
end

#info(id) ⇒ Info

Retrieve job info from the resource manager

Parameters:

  • id (#to_s)

    the id of the job

Returns:

  • (Info)

    information describing submitted job

Raises:



418
419
420
421
422
423
424
# File 'lib/ood_core/job/adapters/htcondor.rb', line 418

# Retrieve job info from the resource manager
#
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if the batch object raises Batch::Error
# @return [Info] information describing submitted job
def info(id)
    job_id = id.to_s
    matches = @htcondor.get_jobs(id: job_id)

    # No record came back for this id, so the job is reported as completed.
    return Info.new(id: job_id, status: :completed) if matches.empty?

    parse_job_info(matches.first)
rescue Batch::Error => err
    raise JobAdapterError, err.message
end

#info_all(attrs: nil) ⇒ Array<Info>

Retrieve information for all jobs

Returns:

  • (Array<Info>)

    list of information describing submitted jobs

Raises:



429
430
431
432
433
434
# File 'lib/ood_core/job/adapters/htcondor.rb', line 429

# Retrieve information for all jobs
#
# @param attrs [Object] accepted for interface compatibility; not consulted here
# @raise [JobAdapterError] if the batch object raises Batch::Error
# @return [Array<Info>] list of information describing submitted jobs
def info_all(attrs: nil)
    @htcondor.get_jobs.map do |record|
        parse_job_info(record)
    end
rescue Batch::Error => err
    raise JobAdapterError, err.message
end

#release(id) ⇒ Object

Release a job from hold

Parameters:

  • id (#to_s)

    the id of the job

Raises:



489
490
491
492
493
# File 'lib/ood_core/job/adapters/htcondor.rb', line 489

# Release a job from hold
#
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if the batch object raises Batch::Error
# @return [Object] whatever the batch object's release_job returns
def release(id)
    # Normalise the id to a String, as #info and #status do, so any object
    # responding to #to_s is accepted as documented.
    @htcondor.release_job(id.to_s)
rescue Batch::Error => e
    raise JobAdapterError, e.message
end

#status(id) ⇒ Symbol

Retrieve the status of a job

Parameters:

  • id (#to_s)

    the id of the job

Returns:

  • (Symbol)

    the status of the job

Raises:



440
441
442
443
444
445
446
# File 'lib/ood_core/job/adapters/htcondor.rb', line 440

# Retrieve the status of a job
#
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if the batch object raises Batch::Error
# @return [Symbol] the status of the job
def status(id)
    job_id = id.to_s
    found = @htcondor.get_jobs(id: job_id)

    # No record came back for this id, so the job is reported as completed.
    return :completed if found.empty?

    get_state(found.first[:status])
rescue Batch::Error => err
    raise JobAdapterError, err.message
end

#submit(script) ⇒ String

Submit a job with the attributes defined in the job template instance

Parameters:

  • script (Script)

    script object that describes the script and attributes for the submitted job

Returns:

  • (String)

    the job id returned after successfully submitting a job

Raises:



315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
# File 'lib/ood_core/job/adapters/htcondor.rb', line 315

# Submit a job with the attributes defined in the job template instance
#
# Builds a condor_submit argument list from the script's attributes, the
# batch object's defaults and the script's native attributes, then hands the
# arguments and the script content to the batch object's submit_string.
#
# @param script [Script] script object that describes the script and
#   attributes for the submitted job
# @raise [JobAdapterError] if the batch object raises Batch::Error, or if the
#   requested e-mail notification cannot be expressed
# @return [String] the job id returned after successfully submitting a job
#   (the value returned by the batch object's submit_string)
def submit(script)
    # Work on a shallow copy: the container branch below rewrites keys, and
    # the caller's script.native must not change as a side effect. A missing
    # native hash is treated as empty.
    native = (script.native || {}).dup

    args = []
    args.concat ["-batch-name", script.job_name.to_s] unless script.job_name.nil?
    args.concat ["-name", script.queue_name.to_s] unless script.queue_name.nil?
    args.concat ["-a", "priority=#{script.priority}"] unless script.priority.nil?
    args.concat ["-a", "accounting_group=#{script.accounting_id}"] unless script.accounting_id.nil?

    # condor_submit's command for submitting in the held state is `hold`;
    # the value comes from the same attribute the guard tests.
    args.concat ["-a", "hold=#{script.submit_as_hold}"] unless script.submit_as_hold.nil?
    # Jobs are only retried when the script is explicitly rerunnable.
    args.concat ["-a", "max_retries=0"] unless script.rerunnable

    unless script.wall_time.nil?
        args.concat ["-a", "allowed_execute_duration=#{script.wall_time}"]
        # NOTE(review): 47 is presumably the hold reason HTCondor assigns when
        # allowed_execute_duration is exceeded — confirm against the manual.
        args.concat ["-a", "periodic_remove='HoldReasonCode == 47'"]
    end
    args.concat ["-a", "deferral_time=#{script.start_time.tv_sec}"] unless script.start_time.nil?

    args.concat ["-a", "request_cpus=#{script.cores}"] unless script.cores.nil?
    # requesting 1GB of memory per core seems reasonable; with no core count
    # given, a single core is assumed instead of failing on nil.
    args.concat ["-a", "request_memory=#{(script.cores || 1) * 1024}"] if native[:request_memory].nil?
    args.concat ["-a", "request_gpus=#{script.gpus_per_node}"] unless script.gpus_per_node.nil?

    universe = native[:universe] || @htcondor.default_universe
    args.concat ["-a", "universe=#{universe}"]
    if universe == "docker"
        # An explicit native docker_image is emitted with the native attributes below.
        args.concat ["-a", "docker_image=#{@htcondor.default_docker_image}"] if native[:docker_image].nil?
    elsif universe == "container"
        # The container universe takes container_image instead of docker_image.
        native[:container_image] = native.delete(:docker_image) || @htcondor.default_docker_image
    end

    args.concat ["-a", "input=#{script.input_path}"] unless script.input_path.nil?
    args.concat ["-a", "output=#{script.output_path.nil? ? 'output.txt' : script.output_path}"]
    args.concat ["-a", "error=#{script.error_path.nil? ? 'error.txt' : script.error_path}"]
    args.concat ["-a", script.workdir.nil? ? "log=job.log" : "log=#{script.workdir}/job.log"]

    args.concat ["-a", "initialdir=#{script.workdir}"] unless script.workdir.nil?
    unless script.job_environment.nil? || script.job_environment.empty?
        # Values are stringified first so non-String values (e.g. Integers) work;
        # single quotes are doubled and double quotes escaped and doubled.
        env_pairs = script.job_environment.to_a.map do |k, v|
            "#{k}='#{v.to_s.gsub("'", "''").gsub('"', "\\\"\\\"")}'"
        end
        args.concat ["-a", "\"environment=\\\"#{env_pairs.join(' ')}\\\"\""]
    end
    args.concat ["-a", "getenv=#{script.copy_environment}"] unless script.copy_environment.nil?

    args.concat ["-a", "should_transfer_files=true"]
    args.concat ["-a", "+OpenOnDemand=true"]

    # send email when started / terminated
    if script.email_on_started && script.email_on_terminated
        # args.concat ["-a", "notification=Always"] # might be supported in the future?
        raise JobAdapterError, "Cannot handle both email_on_started and email_on_terminated set to true"
    elsif script.email_on_started
        if @htcondor.version >= Gem::Version.new("24.10.0")
            args.concat ["-a", "notification=Start"]
        else
            raise JobAdapterError, "Email notification on job start is not supported by this HTCondor version. Please upgrade to 24.10.0 or later."
        end
    elsif script.email_on_terminated
        args.concat ["-a", "notification=Complete"]
    else
        args.concat ["-a", "notification=Never"]
    end
    args.concat ["-a", "notify_user=#{script.email}"] unless script.email.nil?

    # Site-wide attributes first, then the script's native attributes.
    extra_attributes = @htcondor.additional_attributes
    args.concat extra_attributes.to_a.map { |k, v| "-a #{k}=#{v}" } unless extra_attributes.nil? || extra_attributes.empty?
    args.concat native.to_a.map { |k, v| "-a #{k}=#{v}" } unless native.empty?

    content = script.content

    # Set executable to some shell to execute the script
    args.concat ["-a", "executable=#{script.shell_path.nil? ? '/bin/bash' : script.shell_path}"]

    # terse to shut up the output, - to get the script arguments from stdin.
    args.concat ["-terse", "-"]

    if script.job_array_request.nil?
        # If no job array request is specified, we submit a single job
        args.concat ["-queue", "1"]
    else
        # The job array request is a comma separated list whose entries are
        # single ids ("5"), ranges ("1-3") or stepped ranges ("1-10:2").
        # Ranges are expanded so that "1-3,5" becomes "1,2,3,5"; entries that
        # do not look like a range are passed through untouched.
        expanded = script.job_array_request.to_s.split(",").flat_map do |part|
            range = part.match(/\A\s*(\d+)\s*-\s*(\d+)(?::(\d+))?\s*\z/)
            next [part] unless range

            # A missing or zero step falls back to 1 (a zero step cannot iterate).
            stride = [range[3].to_i, 1].max
            range[1].to_i.step(range[2].to_i, stride).to_a
        end
        job_ids = expanded.join(",")
        # Generate multiple jobs in the job array by setting OODArrayId to the requested array ids
        # While -queue 10 would generate 10 jobs, the ProcId would always be 0-9, not 1-10 - or whatever the request is.
        # So we set the OODArrayId to the requested job ids.
        args.concat ["-queue", "1", "+OODArrayId", "in", job_ids]
    end

    script_args = script.args || []

    @htcondor.submit_string(args: args, script_args: script_args, script: content)
rescue Batch::Error => e
    raise JobAdapterError, e.message
end

#supports_job_arrays? ⇒ Boolean

Indicate that the job adapter supports job arrays

Returns:

  • (Boolean)


473
474
475
# File 'lib/ood_core/job/adapters/htcondor.rb', line 473

# Indicate that the job adapter supports job arrays
#
# @return [Boolean] always true for this adapter
def supports_job_arrays?
    true
end