Skip to content

Commit

Permalink
Do not try to delete runner if script already running
Browse files Browse the repository at this point in the history
We catch Octokit::Conflict exception with the "Already exists" message
to ensure the `register_runner` function is idempotent. If it's run a
second time, it fails with this exception.

Recently, we found a few examples with non-idempotent results. If this
label run again after the start of the script, it fails with the
following error:

    Failed to start transient service unit: Unit runner-script.service
    already exists

This PR fixes this issue by checking if the script is already running
before trying to delete the runner.
  • Loading branch information
enescakir committed Apr 30, 2024
1 parent 56a1ff2 commit 50cad1f
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 13 deletions.
29 changes: 20 additions & 9 deletions prog/vm/github_runner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -299,22 +299,33 @@ def setup_info

hop_wait
rescue Octokit::Conflict => e
unless e.message.include?("Already exists")
raise e
end
raise e unless e.message.include?("Already exists")

# If the runner already exists at GitHub side, this suggests that the
# process terminated prematurely before start the runner script and hop wait.
# We need to locate the 'runner_id' using the name and delete it.
# After this, we can register the runner again.
# process terminated prematurely before hop wait. We can't be sure if the
# script was started or not without checking the runner status. We need to
# locate the runner using the name and decide delete or continue to wait.
runners = github_client.paginate("/repos/#{github_runner.repository_name}/actions/runners") do |data, last_response|
data[:runners].concat last_response.data[:runners]
end
unless (runner = runners[:runners].find { _1[:name] == github_runner.ubid.to_s })
fail "BUG: Failed with runner already exists error but couldn't find it"
end
Clog.emit("Deleting GithubRunner because it already exists") { {github_runner: github_runner.values.merge({runner_id: runner[:id]})} }
github_client.delete("/repos/#{github_runner.repository_name}/actions/runners/#{runner[:id]}")
nap 5

runner_id = runner.fetch(:id)
# If the runner script is not started yet, we can delete the runner and
# register it again.
if vm.sshable.cmd("systemctl show -p SubState --value #{SERVICE_NAME}").chomp == "dead"
Clog.emit("Deregistering runner because it already exists") { {github_runner: github_runner.values.merge({runner_id: runner_id})} }
github_client.delete("/repos/#{github_runner.repository_name}/actions/runners/#{runner_id}")
nap 5
end

# The runner script is already started. We persist the runner_id and allow
# wait label to decide the next step.
Clog.emit("The runner already exists but the runner script is started too") { {github_runner: github_runner.values.merge({runner_id: runner_id})} }
github_runner.update(runner_id: runner_id, ready_at: Time.now)
hop_wait
end

label def wait
Expand Down
21 changes: 17 additions & 4 deletions spec/prog/vm/github_runner_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -412,19 +412,32 @@
expect { nx.register_runner }.to hop("wait")
end

it "deletes the runner if the generate request fails due to 'already exists with the same name' error." do
it "deletes the runner if the generate request fails due to 'already exists with the same name' error and the runner script does not start yet." do
expect(client).to receive(:post)
.with(/.*generate-jitconfig/, hash_including(name: github_runner.ubid.to_s, labels: [github_runner.label]))
.and_raise(Octokit::Conflict.new({body: "409 - Already exists - A runner with the name *** already exists."}))
expect(client).to receive(:paginate)
.and_yield({runners: [{name: github_runner.ubid.to_s, id: 123}]}, instance_double(Sawyer::Response, data: {runners: []}))
.and_return({runners: [{name: github_runner.ubid.to_s, id: 123}]})
expect(sshable).to receive(:cmd).with("systemctl show -p SubState --value runner-script").and_return("dead")
expect(client).to receive(:delete).with("/repos/#{github_runner.repository_name}/actions/runners/123")
expect(Clog).to receive(:emit).with("Deleting GithubRunner because it already exists").and_call_original
expect(Clog).to receive(:emit).with("Deregistering runner because it already exists").and_call_original
expect { nx.register_runner }.to nap(5)
end

it "naps if the generate request fails due to 'already exists with the same name' error but couldn't find the runner" do
it "hops to wait if the generate request fails due to 'already exists with the same name' error and the runner script is running" do
expect(client).to receive(:post)
.with(/.*generate-jitconfig/, hash_including(name: github_runner.ubid.to_s, labels: [github_runner.label]))
.and_raise(Octokit::Conflict.new({body: "409 - Already exists - A runner with the name *** already exists."}))
expect(client).to receive(:paginate)
.and_yield({runners: [{name: github_runner.ubid.to_s, id: 123}]}, instance_double(Sawyer::Response, data: {runners: []}))
.and_return({runners: [{name: github_runner.ubid.to_s, id: 123}]})
expect(sshable).to receive(:cmd).with("systemctl show -p SubState --value runner-script").and_return("running")
expect(github_runner).to receive(:update).with(runner_id: 123, ready_at: anything)
expect { nx.register_runner }.to hop("wait")
end

it "fails if the generate request fails due to 'already exists with the same name' error but couldn't find the runner" do
expect(client).to receive(:post)
.with(/.*generate-jitconfig/, hash_including(name: github_runner.ubid.to_s, labels: [github_runner.label]))
.and_raise(Octokit::Conflict.new({body: "409 - Already exists - A runner with the name *** already exists."}))
Expand All @@ -433,7 +446,7 @@
expect { nx.register_runner }.to raise_error RuntimeError, "BUG: Failed with runner already exists error but couldn't find it"
end

it "naps if the generate request fails due to 'Octokit::Conflict' but it's not already exists error" do
it "fails if the generate request fails due to 'Octokit::Conflict' but it's not already exists error" do
expect(client).to receive(:post)
.with(/.*generate-jitconfig/, hash_including(name: github_runner.ubid.to_s, labels: [github_runner.label]))
.and_raise(Octokit::Conflict.new({body: "409 - Another issue"}))
Expand Down

0 comments on commit 50cad1f

Please sign in to comment.