Class: Mortar::Command::Spark
- Includes:
- Git
- Defined in:
- lib/mortar/command/spark.rb
Overview
Run Spark jobs using the Spark Job Server.
Instance Attribute Summary
Attributes inherited from Base
Instance Method Summary collapse
-
#index ⇒ Object
spark CLASS_NAME.
Methods inherited from Base
#api, #ask_public, #config_parameters, #get_error_message_context, #git, #initialize, #initialize_embedded_project, #luigi_parameters, namespace, #pig_parameters, #project, #register_api_call, #register_do, #register_project, #spark_script_arguments, #validate_project_name, #validate_project_structure
Methods included from Helpers
#action, #ask, #confirm, #copy_if_not_present_at_dest, #default_host, #deprecate, #display, #display_header, #display_object, #display_row, #display_table, #display_with_indent, #download_to_file, #ensure_dir_exists, #error, error_with_failure, error_with_failure=, extended, extended_into, #format_bytes, #format_date, #format_with_bang, #full_host, #get_terminal_environment, #home_directory, #host, #hprint, #hputs, included, included_into, #installed_with_omnibus?, #json_decode, #json_encode, #line_formatter, #longest, #output_with_bang, #pending_github_team_state_message, #quantify, #redisplay, #retry_on_exception, #running_on_a_mac?, #running_on_windows?, #set_buffer, #shell, #spinner, #status, #string_distance, #styled_array, #styled_error, #styled_hash, #styled_header, #suggestion, #test_name, #ticking, #time_ago, #truncate, #warning, #with_tty, #write_to_file
Constructor Details
This class inherits a constructor from Mortar::Command::Base
Instance Method Details
#index ⇒ Object
spark CLASS_NAME
Run a spark job on a spark jobserver.
-c, --clusterid CLUSTERID   # Run job on an existing cluster with ID of CLUSTERID (Default: runs on an existing available cluster)
-s, --clustersize NUMNODES  # Run job with NUMNODES nodes (optional; must be >= 2 if provided)
-t, --clustertags A,B,C     # Run job on an existing cluster with the specified tags
-3, --spot                  # Use spot instances for this cluster (Default: true)
-B, --branch BRANCHNAME     # Used with --project to specify a non-master branch
Examples:
Run a spark job:
$ mortar spark com.datadog.some.Job
Run a spark job from master branch:
$ mortar spark -B master com.datadog.some.Job
Run a spark job with some arguments:
$ mortar spark com.datadog.some.Job --env prod s3://your-bucket/input s3://your-bucket/output 100
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# File 'lib/mortar/command/spark.rb', line 47

# spark CLASS_NAME
#
# Run a Spark job (identified by its fully-qualified CLASS_NAME) on a
# Spark Job Server cluster.
#
# Cluster selection:
#   * --clusterid wins outright;
#   * otherwise, with --clustertags, the largest running cluster carrying
#     ALL requested tags is used (ambiguity or no match is an error);
#   * otherwise the largest running cluster with no running jobs is used.
#
# Returns the job id of the submitted job.
#
# NOTE(review): the doc extraction stripped an identifier throughout this
# method (bare `[:branch]`, `[:clustertags]`, `.length`); it has been
# reconstructed as the `options` hash and a `cluster_tags` local, matching
# the Mortar CLI convention — confirm against the original source.
def index
  class_name = shift_argument
  unless class_name
    error("Usage: mortar spark CLASS_NAME\nMust specify CLASS_NAME.")
  end

  project_name = project.name
  script_arguments = spark_script_arguments()

  # With an explicit branch we run against that ref directly; otherwise the
  # local working copy is synced to the cloud and its commit SHA is used.
  # (Gitlab uses "commits/<branch>" for branch views, "commit/<sha>" for SHAs.)
  if options[:branch]
    git_ref = options[:branch]
    gitlab_uri = "commits/#{git_ref}"
  else
    git_ref = sync_code_with_cloud()
    gitlab_uri = "commit/#{git_ref}"
  end

  if options[:clustertags]
    cluster_tags = options[:clustertags].split(',')
  else
    cluster_tags = []
  end

  unless options[:clusterid] || options[:clustersize]
    clusters = api.get_clusters(Mortar::API::Jobs::CLUSTER_BACKEND__EMR_SPARK_JOBSERVER).body['clusters']
    if cluster_tags.length > 0
      # A cluster qualifies only when it is running and carries every
      # requested tag (set difference empty).
      tagged_clusters = clusters.select do |c|
        c['status_code'] == Mortar::API::Clusters::STATUS_RUNNING &&
          (cluster_tags - c['tags']).empty?
      end
      if tagged_clusters.length > 1
        display(tagged_clusters)
        # Interpolation (not String#+) so the Integer length cannot raise
        # TypeError: "no implicit conversion of Integer into String".
        error("There're #{tagged_clusters.length} clusters with tags [#{options[:clustertags]}]. Please, select one cluster.")
      elsif tagged_clusters.length == 0
        error("There're no clusters with tags [#{options[:clustertags]}]")
      end
      largest_cluster = tagged_clusters.max_by { |c| c['size'] }
      options[:clusterid] = largest_cluster['cluster_id']
      display("Running job on the cluster with tags [#{options[:clustertags]}], id = #{largest_cluster['cluster_id']}, size = #{largest_cluster['size']}")
    else
      # No tags given: fall back to the largest running cluster that has
      # no jobs currently executing on it.
      largest_free_cluster = clusters.select { |c|
        c['running_jobs'].length == 0 &&
          c['status_code'] == Mortar::API::Clusters::STATUS_RUNNING
      }.max_by { |c| c['size'] }
      if largest_free_cluster.nil?
        error('No running clusters with Spark Job Server detected, please, launch a SparkJobServer cluster first')
      end
      options[:clusterid] = largest_free_cluster['cluster_id']
      display("Defaulting to running job on largest existing free cluster, id = #{largest_free_cluster['cluster_id']}, size = #{largest_free_cluster['size']}")
    end
  end

  response = action("Requesting job execution") do
    cluster_id = options[:clusterid]
    args = { script_arguments: script_arguments }
    # Only forward clustersize when the caller supplied one.
    args[:clustersize] = options[:clustersize] if options[:clustersize]
    api.post_spark_job_on_jobserver(project_name, class_name, git_ref, cluster_id, **args).body
  end

  display("job_id: #{response['job_id']}")
  display("git_ref: #{git_ref}")
  display
  display("Gitlab CI pipeline status can be viewed at:\n\n https://gitlab.ddbuild.io/DataDog/dd-analytics/#{gitlab_uri}")
  display
  display("Job status can be viewed on the web at:\n\n #{response['web_job_url']}")
  display

  response['job_id']
end