Class: Mortar::Command::Spark
- Includes:
- Git
- Defined in:
- lib/mortar/command/spark.rb
Overview
Run Spark jobs using the Spark Job Server.
Instance Attribute Summary
Attributes inherited from Base
Instance Method Summary collapse
-
#index ⇒ Object
spark CLASS_NAME.
Methods inherited from Base
#api, #ask_public, #config_parameters, #get_error_message_context, #git, #initialize, #initialize_embedded_project, #luigi_parameters, namespace, #pig_parameters, #project, #register_api_call, #register_do, #register_project, #spark_script_arguments, #validate_project_name, #validate_project_structure
Methods included from Helpers
#action, #ask, #confirm, #copy_if_not_present_at_dest, #default_host, #deprecate, #display, #display_header, #display_object, #display_row, #display_table, #display_with_indent, #download_to_file, #ensure_dir_exists, #error, error_with_failure, error_with_failure=, extended, extended_into, #format_bytes, #format_date, #format_with_bang, #full_host, #get_terminal_environment, #home_directory, #host, #hprint, #hputs, included, included_into, #installed_with_omnibus?, #json_decode, #json_encode, #line_formatter, #longest, #output_with_bang, #pending_github_team_state_message, #quantify, #redisplay, #retry_on_exception, #running_on_a_mac?, #running_on_windows?, #set_buffer, #shell, #spinner, #status, #string_distance, #styled_array, #styled_error, #styled_hash, #styled_header, #suggestion, #test_name, #ticking, #time_ago, #truncate, #warning, #with_tty, #write_to_file
Constructor Details
This class inherits a constructor from Mortar::Command::Base
Instance Method Details
#index ⇒ Object
spark CLASS_NAME
Run a spark job on a spark jobserver.
-c, --clusterid CLUSTERID   # Run job on an existing cluster with ID of CLUSTERID (Default: runs on an existing available cluster)
-s, --clustersize NUMNODES  # Run job with NUMNODES nodes (optional; must be >= 2 if provided)
-t, --clustertags A,B,C     # Run job on an existing cluster with the specified tags
-3, --spot                  # Use spot instances for this cluster (Default: true)
-B, --branch BRANCHNAME     # Used with --project to specify a non-master branch
Examples:
Run a spark job:
$ mortar spark com.datadog.some.Job
Run a spark job from master branch:
$ mortar spark -B master com.datadog.some.Job
Run a spark job with some arguments:
$ mortar spark com.datadog.some.Job --env prod s3://your-bucket/input s3://your-bucket/output 100
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# File 'lib/mortar/command/spark.rb', line 47

# spark CLASS_NAME
#
# Run a Spark job (identified by its fully-qualified CLASS_NAME) on a
# Spark Job Server cluster.
#
# Cluster selection:
#   * --clusterid wins outright;
#   * otherwise, with --clustertags, the largest running cluster carrying
#     ALL requested tags is used (ambiguity or no match is an error);
#   * otherwise the largest running cluster with no running jobs is used.
#
# Returns the job id of the submitted job.
#
# NOTE(review): the doc extraction stripped an identifier throughout this
# method (bare `[:branch]`, `[:clustertags]`, `.length`); it has been
# reconstructed as the `options` hash and a `cluster_tags` local, matching
# the Mortar CLI convention — confirm against the original source.
def index
  class_name = shift_argument
  unless class_name
    error("Usage: mortar spark CLASS_NAME\nMust specify CLASS_NAME.")
  end

  project_name = project.name
  script_arguments = spark_script_arguments()

  # With an explicit branch we run against that ref directly; otherwise the
  # local working copy is synced to the cloud and its commit SHA is used.
  # (Gitlab uses "commits/<branch>" for branch views, "commit/<sha>" for SHAs.)
  if options[:branch]
    git_ref = options[:branch]
    gitlab_uri = "commits/#{git_ref}"
  else
    git_ref = sync_code_with_cloud()
    gitlab_uri = "commit/#{git_ref}"
  end

  if options[:clustertags]
    cluster_tags = options[:clustertags].split(',')
  else
    cluster_tags = []
  end

  unless options[:clusterid] || options[:clustersize]
    clusters = api.get_clusters(Mortar::API::Jobs::CLUSTER_BACKEND__EMR_SPARK_JOBSERVER).body['clusters']
    if cluster_tags.length > 0
      # A cluster qualifies only when it is running and carries every
      # requested tag (set difference empty).
      tagged_clusters = clusters.select do |c|
        c['status_code'] == Mortar::API::Clusters::STATUS_RUNNING &&
          (cluster_tags - c['tags']).empty?
      end
      if tagged_clusters.length > 1
        display(tagged_clusters)
        # Interpolation (not String#+) so the Integer length cannot raise
        # TypeError: "no implicit conversion of Integer into String".
        error("There're #{tagged_clusters.length} clusters with tags [#{options[:clustertags]}]. Please, select one cluster.")
      elsif tagged_clusters.length == 0
        error("There're no clusters with tags [#{options[:clustertags]}]")
      end
      largest_cluster = tagged_clusters.max_by { |c| c['size'] }
      options[:clusterid] = largest_cluster['cluster_id']
      display("Running job on the cluster with tags [#{options[:clustertags]}], id = #{largest_cluster['cluster_id']}, size = #{largest_cluster['size']}")
    else
      # No tags given: fall back to the largest running cluster that has
      # no jobs currently executing on it.
      largest_free_cluster = clusters.select { |c|
        c['running_jobs'].length == 0 &&
          c['status_code'] == Mortar::API::Clusters::STATUS_RUNNING
      }.max_by { |c| c['size'] }
      if largest_free_cluster.nil?
        error('No running clusters with Spark Job Server detected, please, launch a SparkJobServer cluster first')
      end
      options[:clusterid] = largest_free_cluster['cluster_id']
      display("Defaulting to running job on largest existing free cluster, id = #{largest_free_cluster['cluster_id']}, size = #{largest_free_cluster['size']}")
    end
  end

  response = action("Requesting job execution") do
    cluster_id = options[:clusterid]
    args = { script_arguments: script_arguments }
    # Only forward clustersize when the caller supplied one.
    args[:clustersize] = options[:clustersize] if options[:clustersize]
    api.post_spark_job_on_jobserver(project_name, class_name, git_ref, cluster_id, **args).body
  end

  display("job_id: #{response['job_id']}")
  display("git_ref: #{git_ref}")
  display
  display("Gitlab CI pipeline status can be viewed at:\n\n https://gitlab.ddbuild.io/DataDog/dd-analytics/#{gitlab_uri}")
  display
  display("Job status can be viewed on the web at:\n\n #{response['web_job_url']}")
  display

  response['job_id']
end