Rework map storage format
pirj opened this issue · comments
Phil Pirozhkov commented
Currently large profiling data sets consume a lot of RAM when loaded.
We should minimize the memory consumed by our execution maps by grouping shared "affected files" by context wherever possible.
E.g. for specs structured like this
RSpec.describe 'test' do
describe '#one' do
context "when two" do
it { is_expected.to eq 'a' }
....
end
end
end
We should have a map:
"test_spec.rb[1]": [files affected by all examples in describe "test"],
"test_spec.rb[1:1]": [files affected by all examples in describe "#one" without ones listed in "test_spec.rb[1]"],
"test_spec.rb[1:1:1]": [files affected by all examples in context "when two" without ones listed in "test_spec.rb[1]" and "test_spec.rb[1:1]"],
"test_spec.rb[1:1:1:1]": [files affected by example "a" without ones listed in "test_spec.rb[1]", "test_spec.rb[1:1]" and "test_spec.rb[1:1:1]"],
...
According to modelling code run against a large codebase (below), this should make Crystalball data files ~4 times smaller.
# Modelling script: estimates how much smaller an execution map becomes when
# files shared by every example of an example group ("context") are stored
# once per context instead of once per example.
#
# Example ids are expected to look like "path/to_spec.rb[1:2:3]", where the
# bracketed, colon-separated numbers are the nesting path and the last number
# identifies the example itself.
class InvestigateNewApproach
  # @param map_path [Pathname, String] location handed to
  #   Crystalball::MapStorage::YAMLStorage.load when the map is first needed
  def initialize(map_path = Pathname('../large-project/tmp/crystalball_data/execution_maps/'))
    @map_path = map_path
  end

  # Lazily loads (and memoizes) the stored execution map.
  def map
    @map ||= Crystalball::MapStorage::YAMLStorage.load(@map_path)
  end

  # Computes per-spec-file statistics and prints a grand total.
  # Stops at the first file whose consistency check fails.
  #
  # @return [Hash] spec filename => stats hash built by #calc_group_stats
  def investigate
    result = {}
    file_based_cases.each do |filename, group|
      result[filename] = calc_group_stats(filename, group.to_h)
      next if result[filename][:check]

      puts "ERROR!"
      break
    end
    puts '================================TOTAL=============='
    puts "Now #{result.values.sum { |v| v[:old_approach] }}, NEW: #{result.values.sum { |v| v[:new_approach] }}"
    result
  end

  # Writes a map containing one entry per context (with the files shared by
  # all of its examples) to +resulting_map+. Contexts with no shared files are
  # skipped.
  #
  # NOTE(review): only context-level entries are emitted here; files unique to
  # a single example are not written — confirm this is intended.
  #
  # @param resulting_map [String] path of the YAML file to write
  def convert!(resulting_map = 'converted_map.yml')
    require 'ostruct'

    storage = Crystalball::MapStorage::YAMLStorage.new(Pathname(resulting_map))
    # Named `converted` so it does not shadow the #map reader above.
    converted = Crystalball::ExecutionMap.new(metadata: {commit: nil, version: 10, timestamp: Time.now.to_i})
    file_based_cases.each do |filename, group|
      files_grouped_by_contexts(group.to_h).each do |context, info|
        next if info[:files].empty?

        e = OpenStruct.new(id: "#{filename}[#{context}]", file_path: filename)
        converted << Crystalball::ExampleGroupMap.new(e, info[:files])
      end
    end
    storage.dump(converted.metadata.to_h)
    storage.dump(converted.cases.to_h)
  end

  # Map cases grouped by the spec file their example id belongs to (memoized).
  def file_based_cases
    @file_based_cases ||= map.cases.group_by { |k, _v| example_filename(k) }
  end

  # @param e [String] example id, e.g. "a_spec.rb[1:2]"
  # @return [String] the part before the first "[", e.g. "a_spec.rb"
  def example_filename(e)
    e.split('[').first
  end

  # Lists every ancestor context of the given example ids, shallowest first.
  # A context shared by several ids appears once per id (no de-duplication).
  #
  # @param example_names [Array<String>] example ids
  # @return [Array<String>] contexts such as "1", "1:2"
  def get_all_contexts(example_names)
    ancestors_of = lambda do |e|
      # Drop the last component: it identifies the example, not a context.
      numbers = e.split('[').last.split(':')[0..-2]
      chain = []
      until numbers.empty?
        chain << numbers.join(':')
        numbers.pop
      end
      chain
    end
    example_names.flat_map(&ancestors_of).sort_by { |v| v.split(':').size }
  end

  # @param context [String] e.g. "1:2:3"
  # @return [String] the enclosing context ("1:2"), or "" for a top-level one
  def parent_context(context)
    context.split(':')[0..-2].join(':')
  end

  # Builds and prints the statistics for a single spec file.
  #
  # @param filename [String] spec file name (used for output only)
  # @param group [Hash{String => Array}] example id => affected files
  # @return [Hash] with keys :old_approach, :example_count, :contexts,
  #   :example_uniq, :new_approach and :check
  def calc_group_stats(filename, group)
    context_files = files_grouped_by_contexts(group)

    # Files left for each example once everything stored on its contexts is removed.
    example_uniq = group.each_with_object({}) do |(key, value), uniq|
      uniq[key] = get_all_contexts([key]).reduce(value) { |files, c| files - context_files[c][:files] }
    end

    values = group.values
    file_result = {
      old_approach: values.sum(&:size),
      example_count: values.size,
      contexts: context_files,
      example_uniq: example_uniq
    }

    # Each context stores its files once instead of once per example.
    saved = context_files.values.sum { |info| (info[:examples_count] - 1) * info[:files].size }
    file_result[:new_approach] = file_result[:old_approach] - saved

    ratio = (file_result[:new_approach].to_f * 100 / file_result[:old_approach]).round(2)
    uniq_total = example_uniq.values.sum(&:size)
    puts "#{filename}. #{ratio} Now: #{file_result[:old_approach]}, Examples: #{file_result[:example_count]}, Example_Uniq: #{uniq_total}, New: #{file_result[:new_approach]}"

    file_result[:check] = extended_check(file_result, group)
    unless file_result[:check]
      puts "Check did not pass!"
      puts "Contexts #{file_result[:contexts].map { |k, v| [k, "#{v[:examples_count]}*#{v[:files].size}"] }.to_h}"
    end
    file_result
  end

  # For every context, finds the files common to all examples inside it,
  # excluding files already attributed to any ancestor context.
  #
  # @param group [Hash{String => Array}] example id => affected files
  # @return [Hash{String => Hash}] context => {examples_count:, files:}
  def files_grouped_by_contexts(group)
    # Each context is listed once per example; compute it only once.
    # Shallowest first, so a parent is always ready before its children.
    contexts = get_all_contexts(group.keys).uniq
    context_files = {}

    # Files stored on +context+ and on all of its ancestors.
    inherited_files = lambda do |context|
      files = []
      until context.empty?
        files += context_files[context][:files]
        context = parent_context(context)
      end
      files
    end

    contexts.each do |c|
      # The trailing ":" stops context "1:1" from also claiming examples that
      # live under "1:10", "1:11", ... (a bare prefix match would).
      marker = "[#{c}:"
      parent_c = parent_context(c)
      info = {examples_count: 0}

      group.each do |key, value|
        next unless key.include?(marker)

        info[:examples_count] += 1
        info[:files] = info[:files].nil? ? value - inherited_files.call(parent_c) : info[:files] & value
      end
      context_files[c] = info
    end
    context_files
  end

  # Verifies that, for every example, its unique files plus the files of all
  # its contexts add up to the original file count. Mismatches are printed.
  #
  # @return [Boolean] true when every example adds up
  def extended_check(file_result, group)
    result = true
    group.each do |e, values|
      context_total = get_all_contexts([e]).sum { |con| file_result[:contexts][con][:files].size }
      sum = file_result[:example_uniq][e].size + context_total
      next if sum == values.size

      puts "#{e} #{sum} != #{values.size}"
      result = false
    end
    result
  end
end
Pavel Shutsin commented
You can check #103; it might help you.
Phil Pirozhkov commented
Thanks, @pluff !
Do you think that pull request covers it all, and this issue may be resolved? Or is there anything left to improve?
Pavel Shutsin commented
Only if you have any suggestions for improvement.