Add actual verification of raw data archives after creation, and only clear raw_data for verified archives.

Eugene Burmakin 2025-12-10 21:21:43 +01:00
parent 393619051a
commit 9fb4f908ad
2 changed files with 63 additions and 6 deletions
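The diff below covers the verification side of the change: the verifier now extracts each point's raw_data from the archive and compares it against the database before an archive can count as verified. The clearing behaviour named in the commit title is not visible in these two files; presumably it keys off the verified_at timestamp the verifier sets. A minimal sketch of how such gating could look, with assumed surrounding code (the verifier class and verified_at come from the diff and spec; the constructor call and archive_point_ids are illustrative only):

# Sketch only, not the project's actual code: clear raw_data only once the
# archive holding those points has passed verification.
verifier = Points::RawData::Verifier.new   # construction details are assumed
verifier.verify_specific_archive(archive.id)

if archive.reload.verified_at.present?
  # The archive round-tripped correctly, so the original raw_data can be dropped.
  Point.where(id: archive_point_ids).update_all(raw_data: nil)
end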

View file

@@ -87,13 +87,15 @@ module Points
           end
         end
 
-        # 5. Verify file can be decompressed and is valid JSONL
+        # 5. Verify file can be decompressed and is valid JSONL, extract data
         begin
-          point_ids = decompress_and_extract_point_ids(compressed_content)
+          archived_data = decompress_and_extract_data(compressed_content)
         rescue StandardError => e
           return { success: false, error: "Decompression/parsing failed: #{e.message}" }
         end
 
+        point_ids = archived_data.keys
+
         # 6. Verify point count matches
         if point_ids.count != archive.point_count
           return {
@@ -117,21 +119,58 @@ module Points
           }
         end
 
+        # 9. Verify archived raw_data matches current database raw_data
+        verification_result = verify_raw_data_matches(archived_data)
+        return verification_result unless verification_result[:success]
+
        { success: true }
       end
 
-      def decompress_and_extract_point_ids(compressed_content)
+      def decompress_and_extract_data(compressed_content)
         io = StringIO.new(compressed_content)
         gz = Zlib::GzipReader.new(io)
-        point_ids = []
+        archived_data = {}
 
         gz.each_line do |line|
           data = JSON.parse(line)
-          point_ids << data['id']
+          archived_data[data['id']] = data['raw_data']
         end
 
         gz.close
-        point_ids
+        archived_data
       end
 
+      def verify_raw_data_matches(archived_data)
+        # Sample verification: check random points to ensure archived data matches database
+        # For performance, we'll verify a sample rather than all points
+        sample_size = [archived_data.size, 100].min
+        point_ids_to_check = archived_data.keys.sample(sample_size)
+
+        mismatches = []
+
+        Point.where(id: point_ids_to_check).find_each do |point|
+          archived_raw_data = archived_data[point.id]
+          current_raw_data = point.raw_data
+
+          # Compare the raw_data (both should be hashes)
+          if archived_raw_data != current_raw_data
+            mismatches << {
+              point_id: point.id,
+              archived: archived_raw_data,
+              current: current_raw_data
+            }
+          end
+        end
+
+        if mismatches.any?
+          return {
+            success: false,
+            error: "Raw data mismatch detected in #{mismatches.count} point(s). " \
+                   "First mismatch: Point #{mismatches.first[:point_id]}"
+          }
+        end
+
+        { success: true }
+      end
+
       def calculate_checksum(point_ids)
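(Not part of the diff.) The extraction helper above assumes each archive is gzip-compressed JSONL, one JSON object per point carrying at least 'id' and 'raw_data'. A self-contained round-trip sketch under that assumption; the writer side is illustrative, while the reader mirrors decompress_and_extract_data:

require 'json'
require 'stringio'
require 'zlib'

points = [
  { 'id' => 1, 'raw_data' => { 'lon' => 13.4, 'lat' => 52.5 } },
  { 'id' => 2, 'raw_data' => { 'lon' => 2.35, 'lat' => 48.85 } }
]

# Writer side (assumed format): one JSON object per line, gzipped into memory.
buffer = StringIO.new
gz = Zlib::GzipWriter.new(buffer)
points.each { |point| gz.puts(point.to_json) }
gz.finish # flush the gzip stream without closing the underlying StringIO
compressed_content = buffer.string

# Reader side, mirroring decompress_and_extract_data from the diff.
archived_data = {}
Zlib::GzipReader.new(StringIO.new(compressed_content)).each_line do |line|
  data = JSON.parse(line)
  archived_data[data['id']] = data['raw_data']
end

archived_data # => { 1 => { 'lon' => 13.4, 'lat' => 52.5 }, 2 => { 'lon' => 2.35, 'lat' => 48.85 } }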

View file

@@ -69,6 +69,24 @@ RSpec.describe Points::RawData::Verifier do
         verifier.verify_specific_archive(archive.id)
       end.not_to change { archive.reload.verified_at }
     end
+
+    it 'detects raw_data mismatch between archive and database' do
+      # Modify raw_data in database after archiving
+      points.first.update_column(:raw_data, { lon: 999.0, lat: 999.0 })
+
+      expect do
+        verifier.verify_specific_archive(archive.id)
+      end.not_to change { archive.reload.verified_at }
+    end
+
+    it 'verifies raw_data matches between archive and database' do
+      # Ensure data hasn't changed
+      expect(points.first.raw_data).to eq({ 'lon' => 13.4, 'lat' => 52.5 })
+
+      verifier.verify_specific_archive(archive.id)
+
+      expect(archive.reload.verified_at).to be_present
+    end
   end
 
   describe '#verify_month' do
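(Not part of the diff.) One caveat about verify_raw_data_matches: it compares a random sample of at most 100 points, so on large archives a small amount of corruption can go undetected. A back-of-the-envelope check, assuming corruption independently affects a fraction p of points in a large archive:

# Probability that a 100-point random sample contains no corrupted point.
p = 0.01                        # 1% of points corrupted
miss_probability = (1 - p)**100
puts miss_probability.round(3)  # => 0.366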