@@ -21,14 +21,46 @@ module Gitlab
       # In case not set, default to 300M. This is for extra-safe.
       DEFAULT_MAX_MEMORY_GROWTH_KB = 300_000
 
+      # Phases of memory killer
+      PHASE = {
+        running: 1,
+        above_soft_limit: 2,
+        stop_fetching_new_jobs: 3,
+        shutting_down: 4,
+        killing_sidekiq: 5
+      }.freeze
+
       def initialize
         super
 
         @enabled = true
+        @metrics = init_metrics
       end
 
       private
 
+      def init_metrics
+        {
+          sidekiq_current_rss: ::Gitlab::Metrics.gauge(:sidekiq_current_rss, 'Current RSS of Sidekiq Worker'),
+          sidekiq_memory_killer_soft_limit_rss: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_soft_limit_rss, 'Current soft_limit_rss of Sidekiq Worker'),
+          sidekiq_memory_killer_hard_limit_rss: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_hard_limit_rss, 'Current hard_limit_rss of Sidekiq Worker'),
+          sidekiq_memory_killer_phase: ::Gitlab::Metrics.gauge(:sidekiq_memory_killer_phase, 'Current phase of Sidekiq Worker')
+        }
+      end
+
+      def refresh_state(phase)
+        @phase = PHASE.fetch(phase)
+        @current_rss = get_rss
+        @soft_limit_rss = get_soft_limit_rss
+        @hard_limit_rss = get_hard_limit_rss
+
+        # track the current state as prometheus gauges
+        @metrics[:sidekiq_memory_killer_phase].set({}, @phase)
+        @metrics[:sidekiq_current_rss].set({}, @current_rss)
+        @metrics[:sidekiq_memory_killer_soft_limit_rss].set({}, @soft_limit_rss)
+        @metrics[:sidekiq_memory_killer_hard_limit_rss].set({}, @hard_limit_rss)
+      end
+
       def run_thread
         Sidekiq.logger.info(
           class: self.class.to_s,
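
Aside, not part of the diff: `PHASE.fetch(phase)` raises `KeyError` for an unknown phase symbol, so a typo in a `refresh_state` call fails loudly instead of exporting a wrong phase number. A minimal sketch of the same snapshot pattern, with a stub gauge standing in for `::Gitlab::Metrics.gauge` (the stub class and its `get` method are illustrative, not GitLab's API):

    # Illustrative stand-in for ::Gitlab::Metrics.gauge: remembers the last value per label set.
    class StubGauge
      def initialize
        @values = {}
      end

      def set(labels, value)
        @values[labels] = value
      end

      def get(labels = {})
        @values[labels]
      end
    end

    PHASE = {
      running: 1,
      above_soft_limit: 2,
      stop_fetching_new_jobs: 3,
      shutting_down: 4,
      killing_sidekiq: 5
    }.freeze

    phase_gauge = StubGauge.new

    # Same idea as refresh_state: translate the phase symbol into its numeric
    # code and export it, so a dashboard can plot the phase over time.
    phase_gauge.set({}, PHASE.fetch(:running))
    phase_gauge.set({}, PHASE.fetch(:shutting_down))
    phase_gauge.get                 # => 4
    # PHASE.fetch(:oops) would raise KeyError rather than export a bogus value.
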
@@ -77,41 +109,51 @@ module Gitlab
         # Tell Sidekiq to stop fetching new jobs
         # We first SIGNAL and then wait given time
         # We also monitor a number of running jobs and allow to restart early
+        refresh_state(:stop_fetching_new_jobs)
         signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs')
         return unless enabled?
 
         # Tell sidekiq to restart itself
         # Keep extra safe to wait `Sidekiq.options[:timeout] + 2` seconds before SIGKILL
+        refresh_state(:shutting_down)
         signal_and_wait(Sidekiq.options[:timeout] + 2, 'SIGTERM', 'gracefully shut down')
         return unless enabled?
 
         # Ideally we should never reach this condition
         # Wait for Sidekiq to shutdown gracefully, and kill it if it didn't
         # Kill the whole pgroup, so we can be sure no children are left behind
+        refresh_state(:killing_sidekiq)
         signal_pgroup('SIGKILL', 'die')
       end
 
       def rss_within_range?
-        current_rss = nil
+        refresh_state(:running)
 
         deadline = Gitlab::Metrics::System.monotonic_time + GRACE_BALLOON_SECONDS.seconds
         loop do
           return true unless enabled?
 
-          current_rss = get_rss
-
           # RSS go above hard limit should trigger forcible shutdown right away
-          break if current_rss > hard_limit_rss
+          break if @current_rss > @hard_limit_rss
 
           # RSS go below the soft limit
-          return true if current_rss < soft_limit_rss
+          return true if @current_rss < @soft_limit_rss
 
           # RSS did not go below the soft limit within deadline, restart
           break if Gitlab::Metrics::System.monotonic_time > deadline
 
           sleep(CHECK_INTERVAL_SECONDS)
+
+          refresh_state(:above_soft_limit)
         end
 
-        log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss)
+        # There are two chances to break from loop:
+        #   - above hard limit, or
+        #   - above soft limit after deadline
+        # When `above hard limit`, it immediately go to `stop_fetching_new_jobs`
+        # So ignore `above hard limit` and always set `above_soft_limit` here
+        refresh_state(:above_soft_limit)
+
+        log_rss_out_of_range(@current_rss, @hard_limit_rss, @soft_limit_rss)
 
         false
       end
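
Aside, not part of the diff: `rss_within_range?` gives the process a grace period to drop back under the soft limit before the killer escalates, and it measures that period against a monotonic clock so wall-clock adjustments cannot stretch or shrink it. A self-contained sketch of that deadline pattern, using `Process.clock_gettime(Process::CLOCK_MONOTONIC)` directly; the method name, parameters, and numbers here are illustrative, not GitLab's (the real method returns false, rather than true, when escalation is needed):

    # Returns true when RSS stayed above the soft limit for the whole grace
    # period (or crossed the hard limit), i.e. when escalation should start.
    def escalation_needed?(read_rss_kb, soft_limit_kb, hard_limit_kb,
                           grace_seconds: 300, check_interval: 3)
      deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + grace_seconds

      loop do
        rss_kb = read_rss_kb.call
        return true  if rss_kb > hard_limit_kb   # hard limit: escalate right away
        return false if rss_kb < soft_limit_kb   # recovered: keep running
        return true  if Process.clock_gettime(Process::CLOCK_MONOTONIC) > deadline

        sleep(check_interval)
      end
    end

    # Example: a worker stuck around 2.5 GB with a 2 GB soft / 3.5 GB hard limit
    # escalates once the grace period runs out.
    escalation_needed?(-> { 2_500_000 }, 2_000_000, 3_500_000,
                       grace_seconds: 1, check_interval: 0.2)
    # => true (after roughly one second)
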
@@ -143,11 +185,11 @@ module Gitlab
         output.to_i
       end
 
-      def soft_limit_rss
+      def get_soft_limit_rss
         SOFT_LIMIT_RSS_KB + rss_increase_by_jobs
       end
 
-      def hard_limit_rss
+      def get_hard_limit_rss
         HARD_LIMIT_RSS_KB
       end
 
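Aside, not part of the diff: the `get_` renames presumably keep these computed values distinct from the `@soft_limit_rss` / `@hard_limit_rss` snapshots that `refresh_state` now caches. `get_soft_limit_rss` also shows that the soft limit is not fixed: it is the base `SOFT_LIMIT_RSS_KB` plus `rss_increase_by_jobs`, an allowance that grows with the jobs currently running (defined elsewhere in this class). A toy calculation with made-up numbers, only to show the shape of the formula:

    SOFT_LIMIT_RSS_KB = 2_000_000            # illustrative base: ~2 GB

    # Hypothetical growth allowances for two memory-hungry jobs in flight;
    # the real values come from per-worker memory growth settings.
    rss_increase_by_jobs = [300_000, 150_000].sum

    soft_limit_rss = SOFT_LIMIT_RSS_KB + rss_increase_by_jobs
    # => 2_450_000 KB: a larger RSS is tolerated while those jobs are running.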