Raw data archival

This commit is contained in:
Eugene Burmakin 2025-12-06 22:43:22 +01:00
parent 44d8aee468
commit 9d93e5df7c
15 changed files with 3055 additions and 1 deletions

2259
RAW_DATA_ARCHIVAL_PLAN.md Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,19 @@
# frozen_string_literal: true

module Points
  module RawData
    # Background job that runs a full raw_data archival pass.
    class ArchiveJob < ApplicationJob
      queue_as :default

      # Delegates to Points::RawData::Archiver and logs the resulting stats
      # hash. Failures are reported and re-raised so the queue backend can
      # apply its retry policy.
      def perform
        stats = Points::RawData::Archiver.new.call
        Rails.logger.info("Archive job complete: #{stats}")
      rescue StandardError => e
        report_failure(e)
      end

      private

      # Logs the error, forwards it to Sentry when the SDK is loaded, and
      # re-raises the original exception.
      def report_failure(e)
        Rails.logger.error("Archive job failed: #{e.message}")
        Sentry.capture_exception(e) if defined?(Sentry)
        raise e
      end
    end
  end
end

View file

@ -0,0 +1,19 @@
# frozen_string_literal: true

module Points
  module RawData
    # Background job that re-archives one user/month, used after a
    # retrospective import adds points to an already-archived month.
    class ReArchiveMonthJob < ApplicationJob
      queue_as :default

      # Runs Archiver#archive_specific_month for the given user/month.
      # Failures are reported and re-raised so the job can be retried.
      def perform(user_id, year, month)
        Rails.logger.info("Re-archiving #{user_id}/#{year}/#{month} (retrospective import)")
        Points::RawData::Archiver.new.archive_specific_month(user_id, year, month)
      rescue StandardError => e
        report_failure(e)
      end

      private

      # Logs the error, forwards it to Sentry when the SDK is loaded, and
      # re-raises the original exception.
      def report_failure(e)
        Rails.logger.error("Re-archive failed: #{e.message}")
        Sentry.capture_exception(e) if defined?(Sentry)
        raise e
      end
    end
  end
end

View file

@ -0,0 +1,100 @@
# frozen_string_literal: true

# Mixin for models whose raw_data column can be offloaded to a compressed
# archive (see Points::RawDataArchive). Provides a raw_data accessor with
# archive fallback, restore helpers, and caching of archive lookups.
module Archivable
  extend ActiveSupport::Concern

  included do
    # Associations
    belongs_to :raw_data_archive,
               class_name: 'Points::RawDataArchive',
               foreign_key: :raw_data_archive_id,
               optional: true

    # Scopes
    scope :archived, -> { where(raw_data_archived: true) }
    scope :not_archived, -> { where(raw_data_archived: false) }
    scope :with_archived_raw_data, -> {
      includes(raw_data_archive: { file_attachment: :blob })
    }
  end

  # Main method: Get raw_data with fallback to archive
  # Use this instead of point.raw_data when you need archived data
  def raw_data_with_archive
    # If raw_data is present in DB, use it
    return raw_data if raw_data.present? || !raw_data_archived?

    # Otherwise fetch from archive
    fetch_archived_raw_data
  end

  # Alias for convenience (optional)
  alias_method :archived_raw_data, :raw_data_with_archive

  # Restore archived data back to database column
  def restore_raw_data!(value)
    update!(
      raw_data: value,
      raw_data_archived: false,
      raw_data_archive_id: nil
    )
  end

  # Cache key for long-term archive caching
  def archive_cache_key
    "raw_data:archive:#{self.class.name.underscore}:#{id}"
  end

  private

  # Resolves archived raw_data via (1) the temporary restore cache,
  # (2) the long-term cache, (3) the archive blob itself.
  # Errors degrade to {} via handle_archive_fetch_error.
  def fetch_archived_raw_data
    # Check temporary restore cache first (for migrations)
    cached = check_temporary_restore_cache
    return cached if cached

    # Check long-term cache (1 day TTL)
    Rails.cache.fetch(archive_cache_key, expires_in: 1.day) do
      fetch_from_archive_file
    end
  rescue StandardError => e
    handle_archive_fetch_error(e)
  end

  # Reads the key Points::RawData::Restorer#restore_to_memory writes.
  # Returns nil when the model has no timestamp or nothing is cached.
  def check_temporary_restore_cache
    return nil unless respond_to?(:timestamp)

    recorded_time = Time.at(timestamp)
    cache_key = "raw_data:temp:#{user_id}:#{recorded_time.year}:#{recorded_time.month}:#{id}"
    Rails.cache.read(cache_key)
  end

  # Downloads the archive blob and scans the gzipped JSONL stream for this
  # record's id. Returns {} when no file is attached or the id is absent.
  def fetch_from_archive_file
    return {} unless raw_data_archive&.file&.attached?

    # Download and search through JSONL
    compressed_content = raw_data_archive.file.blob.download

    # Bug fix: the reader was previously closed only on the happy path, so
    # an exception mid-scan (e.g. JSON parse error) leaked the zlib stream.
    # `ensure` guarantees the close.
    gz = Zlib::GzipReader.new(StringIO.new(compressed_content))
    begin
      gz.each_line do |line|
        data = JSON.parse(line)
        return data['raw_data'] || {} if data['id'] == id
      end
      {}
    ensure
      gz.close
    end
  end

  # Logs + reports the failure and returns an empty hash so callers keep
  # working without archived data.
  def handle_archive_fetch_error(error)
    Rails.logger.error(
      "Failed to fetch archived raw_data for #{self.class.name} #{id}: #{error.message}"
    )
    Sentry.capture_exception(error) if defined?(Sentry)
    {} # Graceful degradation
  end
end

View file

@ -3,6 +3,7 @@
class Point < ApplicationRecord
include Nearable
include Distanceable
include Archivable
belongs_to :import, optional: true, counter_cache: true
belongs_to :visit, optional: true

View file

@ -0,0 +1,48 @@
# frozen_string_literal: true

module Points
  # One compressed JSONL chunk of archived points.raw_data for a given
  # user/month. The payload itself is stored via ActiveStorage (#file).
  class RawDataArchive < ApplicationRecord
    self.table_name = 'points_raw_data_archives'

    belongs_to :user
    has_many :points, foreign_key: :raw_data_archive_id, dependent: :nullify
    has_one_attached :file

    validates :year, :month, :chunk_number, :point_count, presence: true
    validates :year, numericality: { greater_than: 1970, less_than: 2100 }
    validates :month, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 12 }
    validates :chunk_number, numericality: { greater_than: 0 }
    validates :point_ids_checksum, presence: true
    # Only on update: the record is created first, then the file attached.
    validate :file_must_be_attached, on: :update

    # All chunks for one user/month, in chunk order.
    scope :for_month, lambda { |user_id, year, month|
      where(user_id: user_id, year: year, month: month).order(:chunk_number)
    }
    scope :recent, -> { where('archived_at > ?', 30.days.ago) }
    scope :old, -> { where('archived_at < ?', 1.year.ago) }

    # Human-readable month label, e.g. "March 2024".
    def month_display
      Date.new(year, month, 1).strftime('%B %Y')
    end

    # Canonical blob filename for this chunk.
    def filename
      format('raw_data_%d_%d_%02d_chunk%03d.jsonl.gz', user_id, year, month, chunk_number)
    end

    # Attached blob size in megabytes, rounded to 2 decimals (0 when no file).
    def size_mb
      return 0 unless file.attached?

      (file.blob.byte_size.to_f / (1024 * 1024)).round(2)
    end

    private

    def file_must_be_attached
      errors.add(:file, 'must be attached') unless file.attached?
    end
  end
end

View file

@ -20,6 +20,7 @@ class User < ApplicationRecord # rubocop:disable Metrics/ClassLength
has_many :tags, dependent: :destroy
has_many :trips, dependent: :destroy
has_many :tracks, dependent: :destroy
has_many :raw_data_archives, class_name: 'Points::RawDataArchive', dependent: :destroy
after_create :create_api_key
after_commit :activate, on: :create, if: -> { DawarichSettings.self_hosted? }

View file

@ -0,0 +1,155 @@
# frozen_string_literal: true

module Points
  module RawData
    # Archives points.raw_data for old months into gzip-compressed JSONL
    # chunks stored via ActiveStorage (Points::RawDataArchive), then NULLs
    # the raw_data column to reclaim database space.
    class Archiver
      # Months younger than this are never archived so that recent data
      # stays directly queryable in the database.
      SAFE_ARCHIVE_LAG = 2.months

      def initialize
        @stats = { processed: 0, archived: 0, failed: 0 }
      end

      # Runs a full archival pass over every archivable user/month.
      # Returns the stats hash { processed:, archived:, failed: }.
      # No-ops unless ARCHIVE_RAW_DATA=true.
      def call
        unless archival_enabled?
          Rails.logger.info('Raw data archival disabled (ARCHIVE_RAW_DATA != "true")')
          return @stats
        end

        Rails.logger.info('Starting points raw_data archival...')

        archivable_months.find_each do |month_data|
          process_month(month_data)
        end

        Rails.logger.info("Archival complete: #{@stats}")
        @stats
      end

      # Archives a single user/month regardless of the lag cutoff
      # (used after retrospective imports).
      def archive_specific_month(user_id, year, month)
        month_data = {
          'user_id' => user_id,
          'year' => year,
          'month' => month
        }

        process_month(month_data)
      end

      private

      def archival_enabled?
        ENV['ARCHIVE_RAW_DATA'] == 'true'
      end

      # Relation of { user_id, year, month, unarchived_count } rows for
      # months 2+ months old that still have unarchived points.
      def archivable_months
        # Only months 2+ months old with unarchived points
        safe_cutoff = Date.current.beginning_of_month - SAFE_ARCHIVE_LAG

        Point.select(
          'user_id',
          'EXTRACT(YEAR FROM to_timestamp(timestamp))::int as year',
          'EXTRACT(MONTH FROM to_timestamp(timestamp))::int as month',
          'COUNT(*) as unarchived_count'
        ).where(raw_data_archived: false)
         .where('to_timestamp(timestamp) < ?', safe_cutoff)
         .group('user_id, EXTRACT(YEAR FROM to_timestamp(timestamp)), EXTRACT(MONTH FROM to_timestamp(timestamp))')
      end

      # Archives one month under an advisory lock, tracking stats.
      def process_month(month_data)
        user_id = month_data['user_id']
        year = month_data['year']
        month = month_data['month']
        lock_key = "archive_points:#{user_id}:#{year}:#{month}"

        # Advisory lock prevents duplicate processing. Bug fix: with
        # timeout_seconds: 0 the non-bang with_advisory_lock RETURNS false
        # when the lock is already held instead of raising, so the old
        # `rescue ActiveRecord::AdvisoryLockError` branch was dead code and
        # skips went unlogged. Check the return value instead.
        acquired = ActiveRecord::Base.with_advisory_lock(lock_key, timeout_seconds: 0) do
          archive_month(user_id, year, month)
          @stats[:processed] += 1
          true
        end

        Rails.logger.info("Skipping #{lock_key} - already locked") unless acquired
      rescue StandardError => e
        Rails.logger.error("Archive failed for #{user_id}/#{year}/#{month}: #{e.message}")
        Sentry.capture_exception(e) if defined?(Sentry)
        @stats[:failed] += 1
      end

      # Compresses and stores one month of raw_data, then NULLs the column.
      def archive_month(user_id, year, month)
        # Calculate the month's timestamp range in UTC. Bug fix: Time.new
        # used the server-local zone, which could shift month boundaries
        # relative to the to_timestamp() grouping in archivable_months
        # (rendered in the DB session timezone — TODO confirm it is UTC).
        start_of_month = Time.utc(year, month, 1).to_i
        end_of_month = (Time.utc(year, month, 1) + 1.month).to_i

        # Find unarchived points for this month
        points = Point.where(
          user_id: user_id,
          raw_data_archived: false
        ).where(timestamp: start_of_month...end_of_month)
         .where.not(raw_data: nil) # Skip already-NULLed points

        return if points.empty?

        point_ids = points.pluck(:id)
        Rails.logger.info("Archiving #{point_ids.count} points for user #{user_id}, #{year}-#{format('%02d', month)}")

        # Create archive chunk
        archive = create_archive_chunk(user_id, year, month, points, point_ids)

        # Atomically mark points and NULL raw_data
        Point.transaction do
          Point.where(id: point_ids).update_all(
            raw_data_archived: true,
            raw_data_archive_id: archive.id,
            raw_data: nil # Reclaim space!
          )
        end

        @stats[:archived] += point_ids.count
        Rails.logger.info("✓ Archived chunk #{archive.chunk_number} (#{archive.size_mb} MB)")
      end

      # Creates the archive record and attaches the compressed payload.
      def create_archive_chunk(user_id, year, month, points, point_ids)
        # Determine chunk number (append-only)
        chunk_number = Points::RawDataArchive
                       .where(user_id: user_id, year: year, month: month)
                       .maximum(:chunk_number).to_i + 1

        # Compress points data
        compressed_data = Points::RawData::ChunkCompressor.new(points).compress

        # Create archive record
        archive = Points::RawDataArchive.create!(
          user_id: user_id,
          year: year,
          month: month,
          chunk_number: chunk_number,
          point_count: point_ids.count,
          point_ids_checksum: calculate_checksum(point_ids),
          archived_at: Time.current,
          metadata: {
            format_version: 1,
            compression: 'gzip',
            archived_by: 'Points::RawData::Archiver'
          }
        )

        # Attach compressed file via ActiveStorage. Consistency fix: reuse
        # the model's #filename builder instead of duplicating the format
        # string here.
        archive.file.attach(
          io: StringIO.new(compressed_data),
          filename: archive.filename,
          content_type: 'application/gzip'
        )

        archive
      end

      # Order-independent fingerprint of the archived point ids.
      def calculate_checksum(point_ids)
        Digest::SHA256.hexdigest(point_ids.sort.join(','))
      end
    end
  end
end

View file

@ -0,0 +1,25 @@
# frozen_string_literal: true

module Points
  module RawData
    # Serializes a points relation to gzip-compressed JSONL
    # (one {"id":..., "raw_data":...} object per line).
    class ChunkCompressor
      # points_relation must respond to #select and #find_each
      # (an ActiveRecord relation in production).
      def initialize(points_relation)
        @points = points_relation
      end

      # Returns the compressed payload as a binary String.
      def compress
        io = StringIO.new
        gz = Zlib::GzipWriter.new(io)

        begin
          # Stream points to avoid memory issues with large months
          @points.select(:id, :raw_data).find_each(batch_size: 1000) do |point|
            # Write as JSONL (one JSON object per line)
            gz.puts({ id: point.id, raw_data: point.raw_data }.to_json)
          end
        ensure
          # Bug fix: an error while streaming previously left the gzip
          # stream open; closing in ensure always finalizes the writer.
          gz.close
        end

        io.string # Returns compressed bytes
      end
    end
  end
end

View file

@ -0,0 +1,106 @@
# frozen_string_literal: true

module Points
  module RawData
    # Restores archived raw_data chunks either back into the points table
    # or into the Rails cache (for temporary access during migrations).
    class Restorer
      # Restores every chunk of one user/month back into points.raw_data,
      # inside a single transaction. Raises when no archives exist.
      def restore_to_database(user_id, year, month)
        archives = Points::RawDataArchive.for_month(user_id, year, month)
        raise "No archives found for user #{user_id}, #{year}-#{month}" if archives.empty?

        Rails.logger.info("Restoring #{archives.count} archives to database...")

        Point.transaction do
          archives.each do |archive|
            restore_archive_to_db(archive)
          end
        end

        Rails.logger.info("✓ Restored #{archives.sum(:point_count)} points")
      end

      # Loads one user/month into the cache under
      # raw_data:temp:<user>:<year>:<month>:<point_id> (1 hour TTL), the key
      # read by Archivable#check_temporary_restore_cache.
      def restore_to_memory(user_id, year, month)
        archives = Points::RawDataArchive.for_month(user_id, year, month)
        raise "No archives found for user #{user_id}, #{year}-#{month}" if archives.empty?

        Rails.logger.info("Loading #{archives.count} archives into cache...")

        cache_key_prefix = "raw_data:temp:#{user_id}:#{year}:#{month}"
        count = 0

        archives.each do |archive|
          count += restore_archive_to_cache(archive, cache_key_prefix)
        end

        Rails.logger.info("✓ Loaded #{count} points into cache (expires in 1 hour)")
      end

      # Restores every archived month for a user, oldest first.
      def restore_all_for_user(user_id)
        archives = Points::RawDataArchive.where(user_id: user_id)
                                         .select(:year, :month)
                                         .distinct
                                         .order(:year, :month)

        Rails.logger.info("Restoring #{archives.count} months for user #{user_id}...")

        archives.each do |archive|
          restore_to_database(user_id, archive.year, archive.month)
        end

        Rails.logger.info("✓ Complete user restore finished")
      end

      private

      # Writes each JSONL row's raw_data back to its point and clears the
      # archive flags. One UPDATE per row — acceptable for manual restores.
      def restore_archive_to_db(archive)
        decompressed = download_and_decompress(archive)

        decompressed.each_line do |line|
          data = JSON.parse(line)

          Point.where(id: data['id']).update_all(
            raw_data: data['raw_data'],
            raw_data_archived: false,
            raw_data_archive_id: nil
          )
        end
      end

      # Caches each row's raw_data for 1 hour; returns the row count.
      def restore_archive_to_cache(archive, cache_key_prefix)
        decompressed = download_and_decompress(archive)
        count = 0

        decompressed.each_line do |line|
          data = JSON.parse(line)

          Rails.cache.write(
            "#{cache_key_prefix}:#{data['id']}",
            data['raw_data'],
            expires_in: 1.hour
          )

          count += 1
        end

        count
      end

      # Downloads the blob and returns the decompressed JSONL content.
      def download_and_decompress(archive)
        # Download via ActiveStorage
        compressed_content = archive.file.blob.download

        # Bug fix: close the gzip reader even when #read raises (e.g. on a
        # truncated stream); previously it leaked on error.
        gz = Zlib::GzipReader.new(StringIO.new(compressed_content))
        begin
          gz.read
        ensure
          gz.close
        end
      rescue StandardError => e
        Rails.logger.error("Failed to download/decompress archive #{archive.id}: #{e.message}")
        raise
      end
    end
  end
end

View file

@ -0,0 +1,23 @@
# frozen_string_literal: true
class CreatePointsRawDataArchives < ActiveRecord::Migration[8.0]
def change
create_table :points_raw_data_archives do |t|
t.bigint :user_id, null: false
t.integer :year, null: false
t.integer :month, null: false
t.integer :chunk_number, null: false, default: 1
t.integer :point_count, null: false
t.string :point_ids_checksum, null: false
t.jsonb :metadata, default: {}, null: false
t.datetime :archived_at, null: false
t.timestamps
end
add_index :points_raw_data_archives, :user_id
add_index :points_raw_data_archives, [:user_id, :year, :month]
add_index :points_raw_data_archives, :archived_at
add_foreign_key :points_raw_data_archives, :users, validate: false
end
end

View file

@ -0,0 +1,22 @@
# frozen_string_literal: true
class AddArchivalColumnsToPoints < ActiveRecord::Migration[8.0]
disable_ddl_transaction!
def change
add_column :points, :raw_data_archived, :boolean, default: false, null: false
add_column :points, :raw_data_archive_id, :bigint, null: true
add_index :points, :raw_data_archived,
where: 'raw_data_archived = true',
name: 'index_points_on_archived_true',
algorithm: :concurrently
add_index :points, :raw_data_archive_id,
algorithm: :concurrently
add_foreign_key :points, :points_raw_data_archives,
column: :raw_data_archive_id,
on_delete: :nullify, # Don't delete points if archive deleted
validate: false
end
end

View file

@ -0,0 +1,8 @@
# frozen_string_literal: true
class ValidateArchivalForeignKeys < ActiveRecord::Migration[8.0]
def change
validate_foreign_key :points_raw_data_archives, :users
validate_foreign_key :points, :points_raw_data_archives
end
end

27
db/schema.rb generated
View file

@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema[8.0].define(version: 2025_12_01_192510) do
ActiveRecord::Schema[8.0].define(version: 2025_12_06_000004) do
# These are extensions that must be enabled in order to support this database
enable_extension "pg_catalog.plpgsql"
enable_extension "postgis"
@ -224,6 +224,10 @@ ActiveRecord::Schema[8.0].define(version: 2025_12_01_192510) do
t.bigint "country_id"
t.bigint "track_id"
t.string "country_name"
t.boolean "raw_data_archived", default: false, null: false
t.bigint "raw_data_archive_id"
t.integer "timestamp_year"
t.integer "timestamp_month"
t.index ["altitude"], name: "index_points_on_altitude"
t.index ["battery"], name: "index_points_on_battery"
t.index ["battery_status"], name: "index_points_on_battery_status"
@ -238,6 +242,8 @@ ActiveRecord::Schema[8.0].define(version: 2025_12_01_192510) do
t.index ["latitude", "longitude"], name: "index_points_on_latitude_and_longitude"
t.index ["lonlat", "timestamp", "user_id"], name: "index_points_on_lonlat_timestamp_user_id", unique: true
t.index ["lonlat"], name: "index_points_on_lonlat", using: :gist
t.index ["raw_data_archive_id"], name: "index_points_on_raw_data_archive_id"
t.index ["raw_data_archived"], name: "index_points_on_archived_true", where: "(raw_data_archived = true)"
t.index ["reverse_geocoded_at"], name: "index_points_on_reverse_geocoded_at"
t.index ["timestamp"], name: "index_points_on_timestamp"
t.index ["track_id"], name: "index_points_on_track_id"
@ -245,10 +251,27 @@ ActiveRecord::Schema[8.0].define(version: 2025_12_01_192510) do
t.index ["user_id", "country_name"], name: "idx_points_user_country_name"
t.index ["user_id", "reverse_geocoded_at"], name: "index_points_on_user_id_and_reverse_geocoded_at", where: "(reverse_geocoded_at IS NOT NULL)"
t.index ["user_id", "timestamp", "track_id"], name: "idx_points_track_generation"
t.index ["user_id", "timestamp_year", "timestamp_month", "raw_data_archived"], name: "index_points_on_user_time_archived"
t.index ["user_id"], name: "index_points_on_user_id"
t.index ["visit_id"], name: "index_points_on_visit_id"
end
create_table "points_raw_data_archives", force: :cascade do |t|
t.bigint "user_id", null: false
t.integer "year", null: false
t.integer "month", null: false
t.integer "chunk_number", default: 1, null: false
t.integer "point_count", null: false
t.string "point_ids_checksum", null: false
t.jsonb "metadata", default: {}, null: false
t.datetime "archived_at", null: false
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["archived_at"], name: "index_points_raw_data_archives_on_archived_at"
t.index ["user_id", "year", "month"], name: "index_points_raw_data_archives_on_user_id_and_year_and_month"
t.index ["user_id"], name: "index_points_raw_data_archives_on_user_id"
end
create_table "stats", force: :cascade do |t|
t.integer "year", null: false
t.integer "month", null: false
@ -384,8 +407,10 @@ ActiveRecord::Schema[8.0].define(version: 2025_12_01_192510) do
add_foreign_key "notifications", "users"
add_foreign_key "place_visits", "places"
add_foreign_key "place_visits", "visits"
add_foreign_key "points", "points_raw_data_archives", column: "raw_data_archive_id", on_delete: :nullify
add_foreign_key "points", "users"
add_foreign_key "points", "visits"
add_foreign_key "points_raw_data_archives", "users"
add_foreign_key "stats", "users"
add_foreign_key "taggings", "tags"
add_foreign_key "tags", "users"

View file

@ -0,0 +1,243 @@
# frozen_string_literal: true
namespace :points do
namespace :raw_data do
desc 'Restore raw_data from archive to database for a specific month'
task :restore, [:user_id, :year, :month] => :environment do |_t, args|
validate_args!(args)
user_id = args[:user_id].to_i
year = args[:year].to_i
month = args[:month].to_i
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ' Restoring raw_data to DATABASE'
puts " User: #{user_id} | Month: #{year}-#{format('%02d', month)}"
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ''
restorer = Points::RawData::Restorer.new
restorer.restore_to_database(user_id, year, month)
puts ''
puts '✓ Restoration complete!'
puts ''
puts "Points in #{year}-#{month} now have raw_data in database."
puts 'Run VACUUM ANALYZE points; to update statistics.'
end
desc 'Restore raw_data to memory/cache temporarily (for data migrations)'
task :restore_temporary, [:user_id, :year, :month] => :environment do |_t, args|
validate_args!(args)
user_id = args[:user_id].to_i
year = args[:year].to_i
month = args[:month].to_i
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ' Loading raw_data into CACHE (temporary)'
puts " User: #{user_id} | Month: #{year}-#{format('%02d', month)}"
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ''
puts 'Data will be available for 1 hour via Point.raw_data_with_archive accessor'
puts ''
restorer = Points::RawData::Restorer.new
restorer.restore_to_memory(user_id, year, month)
puts ''
puts '✓ Cache loaded successfully!'
puts ''
puts 'You can now run your data migration.'
puts 'Example:'
puts " rails runner \"Point.where(user_id: #{user_id}, timestamp_year: #{year}, timestamp_month: #{month}).find_each { |p| p.fix_coordinates_from_raw_data }\""
puts ''
puts 'Cache will expire in 1 hour automatically.'
end
desc 'Restore all archived raw_data for a user'
task :restore_all, [:user_id] => :environment do |_t, args|
raise 'Usage: rake points:raw_data:restore_all[user_id]' unless args[:user_id]
user_id = args[:user_id].to_i
user = User.find(user_id)
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ' Restoring ALL archives for user'
puts " #{user.email} (ID: #{user_id})"
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ''
archives = Points::RawDataArchive.where(user_id: user_id)
.select(:year, :month)
.distinct
.order(:year, :month)
puts "Found #{archives.count} months to restore"
puts ''
archives.each_with_index do |archive, idx|
puts "[#{idx + 1}/#{archives.count}] Restoring #{archive.year}-#{format('%02d', archive.month)}..."
restorer = Points::RawData::Restorer.new
restorer.restore_to_database(user_id, archive.year, archive.month)
end
puts ''
puts "✓ All archives restored for user #{user_id}!"
end
desc 'Show archive statistics'
task status: :environment do
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ' Points raw_data Archive Statistics'
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ''
total_archives = Points::RawDataArchive.count
total_points = Point.count
archived_points = Point.where(raw_data_archived: true).count
percentage = total_points.positive? ? (archived_points.to_f / total_points * 100).round(2) : 0
puts "Archives: #{total_archives}"
puts "Points archived: #{archived_points} / #{total_points} (#{percentage}%)"
puts ''
# Storage size via ActiveStorage
total_blob_size = ActiveStorage::Blob
.joins('INNER JOIN active_storage_attachments ON active_storage_attachments.blob_id = active_storage_blobs.id')
.where("active_storage_attachments.record_type = 'Points::RawDataArchive'")
.sum(:byte_size)
puts "Storage used: #{ActiveSupport::NumberHelper.number_to_human_size(total_blob_size)}"
puts ''
# Recent activity
recent = Points::RawDataArchive.where('archived_at > ?', 7.days.ago).count
puts "Archives created last 7 days: #{recent}"
puts ''
# Top users
puts 'Top 10 users by archive count:'
puts '─────────────────────────────────────────────────'
Points::RawDataArchive.group(:user_id)
.select('user_id, COUNT(*) as archive_count, SUM(point_count) as total_points')
.order('archive_count DESC')
.limit(10)
.each_with_index do |stat, idx|
user = User.find(stat.user_id)
puts "#{idx + 1}. #{user.email.ljust(30)} #{stat.archive_count.to_s.rjust(3)} archives, #{stat.total_points.to_s.rjust(8)} points"
end
puts ''
end
desc 'Verify archive integrity for a month'
task :verify, [:user_id, :year, :month] => :environment do |_t, args|
validate_args!(args)
user_id = args[:user_id].to_i
year = args[:year].to_i
month = args[:month].to_i
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ' Verifying Archives'
puts " User: #{user_id} | Month: #{year}-#{format('%02d', month)}"
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ''
archives = Points::RawDataArchive.for_month(user_id, year, month)
if archives.empty?
puts 'No archives found.'
exit
end
all_ok = true
archives.each do |archive|
print "Chunk #{archive.chunk_number}: "
# Check file attached
unless archive.file.attached?
puts '✗ ERROR - File not attached!'
all_ok = false
next
end
# Download and count
begin
compressed = archive.file.blob.download
io = StringIO.new(compressed)
gz = Zlib::GzipReader.new(io)
actual_count = 0
gz.each_line { actual_count += 1 }
gz.close
if actual_count == archive.point_count
puts "✓ OK (#{actual_count} points, #{archive.size_mb} MB)"
else
puts "✗ MISMATCH - Expected #{archive.point_count}, found #{actual_count}"
all_ok = false
end
rescue StandardError => e
puts "✗ ERROR - #{e.message}"
all_ok = false
end
end
puts ''
if all_ok
puts '✓ All archives verified successfully!'
else
puts '✗ Some archives have issues. Please investigate.'
end
end
desc 'Run initial archival for old data (safe to re-run)'
task initial_archive: :environment do
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ' Initial Archival (2+ months old data)'
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ''
puts 'This will archive points.raw_data for months 2+ months old.'
puts 'This is safe to run multiple times (idempotent).'
puts ''
print 'Continue? (y/N): '
response = $stdin.gets.chomp.downcase
unless response == 'y'
puts 'Cancelled.'
exit
end
puts ''
stats = Points::RawData::Archiver.new.call
puts ''
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ' Archival Complete'
puts '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
puts ''
puts "Months processed: #{stats[:processed]}"
puts "Points archived: #{stats[:archived]}"
puts "Failures: #{stats[:failed]}"
puts ''
return unless stats[:archived].positive?
puts 'Next steps:'
puts '1. Verify a sample: rake points:raw_data:verify[user_id,year,month]'
puts '2. Check stats: rake points:raw_data:status'
puts '3. (Optional) Reclaim space: VACUUM FULL points; (during maintenance)'
end
end
end
# Ensures the rake task received user_id, year and month arguments;
# raises a usage message when any of them is missing.
def validate_args!(args)
  missing = %i[user_id year month].reject { |key| args[key] }
  raise 'Usage: rake points:raw_data:TASK[user_id,year,month]' if missing.any?
end