From 9fb4f908ad09d458d469b2b7f4d0ef82af167e10 Mon Sep 17 00:00:00 2001
From: Eugene Burmakin
Date: Wed, 10 Dec 2025 21:21:43 +0100
Subject: [PATCH] Add actual verification of raw data archives after creation,
 and only clear raw_data for verified archives.

---
 app/services/points/raw_data/verifier.rb     | 51 ++++++++++++++++---
 .../services/points/raw_data/verifier_spec.rb | 18 +++++++
 2 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/app/services/points/raw_data/verifier.rb b/app/services/points/raw_data/verifier.rb
index 0155bde5..833c1f38 100644
--- a/app/services/points/raw_data/verifier.rb
+++ b/app/services/points/raw_data/verifier.rb
@@ -87,13 +87,15 @@ module Points
         end
       end
 
-      # 5. Verify file can be decompressed and is valid JSONL
+      # 5. Verify file can be decompressed and is valid JSONL, then extract data
       begin
-        point_ids = decompress_and_extract_point_ids(compressed_content)
+        archived_data = decompress_and_extract_data(compressed_content)
       rescue StandardError => e
         return { success: false, error: "Decompression/parsing failed: #{e.message}" }
       end
 
+      point_ids = archived_data.keys
+
       # 6. Verify point count matches
       if point_ids.count != archive.point_count
         return {
@@ -117,21 +119,58 @@ module Points
         }
       end
 
+      # 9. Verify archived raw_data matches current database raw_data
+      verification_result = verify_raw_data_matches(archived_data)
+      return verification_result unless verification_result[:success]
+
       { success: true }
     end
 
-    def decompress_and_extract_point_ids(compressed_content)
+    def decompress_and_extract_data(compressed_content)
       io = StringIO.new(compressed_content)
       gz = Zlib::GzipReader.new(io)
 
-      point_ids = []
+      archived_data = {}
 
       gz.each_line do |line|
         data = JSON.parse(line)
-        point_ids << data['id']
+        archived_data[data['id']] = data['raw_data']
       end
       gz.close
-      point_ids
+      archived_data
     end
 
+    def verify_raw_data_matches(archived_data)
+      # Sample verification: check random points to ensure archived data matches the database
+      # For performance, we verify a sample rather than all points
+      sample_size = [archived_data.size, 100].min
+      point_ids_to_check = archived_data.keys.sample(sample_size)
+
+      mismatches = []
+
+      Point.where(id: point_ids_to_check).find_each do |point|
+        archived_raw_data = archived_data[point.id]
+        current_raw_data = point.raw_data
+
+        # Compare the raw_data (both should be hashes)
+        if archived_raw_data != current_raw_data
+          mismatches << {
+            point_id: point.id,
+            archived: archived_raw_data,
+            current: current_raw_data
+          }
+        end
+      end
+
+      if mismatches.any?
+        return {
+          success: false,
+          error: "Raw data mismatch detected in #{mismatches.count} point(s). " \
" \ + "First mismatch: Point #{mismatches.first[:point_id]}" + } + end + + { success: true } end def calculate_checksum(point_ids) diff --git a/spec/services/points/raw_data/verifier_spec.rb b/spec/services/points/raw_data/verifier_spec.rb index 9aa6901c..ed77ce55 100644 --- a/spec/services/points/raw_data/verifier_spec.rb +++ b/spec/services/points/raw_data/verifier_spec.rb @@ -69,6 +69,24 @@ RSpec.describe Points::RawData::Verifier do verifier.verify_specific_archive(archive.id) end.not_to change { archive.reload.verified_at } end + + it 'detects raw_data mismatch between archive and database' do + # Modify raw_data in database after archiving + points.first.update_column(:raw_data, { lon: 999.0, lat: 999.0 }) + + expect do + verifier.verify_specific_archive(archive.id) + end.not_to change { archive.reload.verified_at } + end + + it 'verifies raw_data matches between archive and database' do + # Ensure data hasn't changed + expect(points.first.raw_data).to eq({ 'lon' => 13.4, 'lat' => 52.5 }) + + verifier.verify_specific_archive(archive.id) + + expect(archive.reload.verified_at).to be_present + end end describe '#verify_month' do