# dawarich/lib/tasks/data_cleanup.rake
# 2025-03-21 18:16:23 +01:00
# 105 lines, 3.2 KiB, Ruby
# frozen_string_literal: true
require 'csv'
namespace :data_cleanup do
  desc 'Remove duplicate points using raw SQL and export them to a file'
  # Finds duplicate rows in `points` (by lat/long or by lonlat geometry,
  # each combined with timestamp + user_id), exports them to a timestamped
  # CSV under tmp/, then deletes them. The first row of each duplicate
  # group (lowest id) is always kept.
  task remove_duplicate_points: :environment do
    timestamp = Time.current.strftime('%Y%m%d%H%M%S')
    export_path = Rails.root.join("tmp/duplicate_points_#{timestamp}.csv")
    connection = ActiveRecord::Base.connection

    puts 'Finding duplicates...'

    begin
      # One temp table per duplicate criterion. ROW_NUMBER keeps the lowest
      # id in each group (row_num = 1) and flags the rest as duplicates.
      connection.execute(<<~SQL)
        DROP TABLE IF EXISTS lat_long_duplicates;
        CREATE TEMPORARY TABLE lat_long_duplicates AS
        SELECT id
        FROM (
          SELECT id,
                 ROW_NUMBER() OVER (PARTITION BY latitude, longitude, timestamp, user_id ORDER BY id) AS row_num
          FROM points
        ) AS dups
        WHERE dups.row_num > 1;
      SQL

      connection.execute(<<~SQL)
        DROP TABLE IF EXISTS lonlat_duplicates;
        CREATE TEMPORARY TABLE lonlat_duplicates AS
        SELECT id
        FROM (
          SELECT id,
                 ROW_NUMBER() OVER (PARTITION BY lonlat, timestamp, user_id ORDER BY id) AS row_num
          FROM points
        ) AS dups
        WHERE dups.row_num > 1;
      SQL

      # UNION (not UNION ALL) so an id matching both criteria appears once.
      connection.execute(<<~SQL)
        DROP TABLE IF EXISTS duplicate_points;
        CREATE TEMPORARY TABLE duplicate_points AS
        SELECT id FROM lat_long_duplicates
        UNION
        SELECT id FROM lonlat_duplicates;
      SQL

      duplicate_count = connection.select_value('SELECT COUNT(*) FROM duplicate_points').to_i
      puts "Found #{duplicate_count} duplicate points"

      if duplicate_count.positive?
        puts "Exporting duplicates to #{export_path}..."

        columns = connection.select_values(
          "SELECT column_name FROM information_schema.columns WHERE table_name = 'points' ORDER BY ordinal_position"
        )
        # Quote column names: `timestamp` (and potentially others) are SQL
        # keywords and must not be interpolated bare into the SELECT list.
        quoted_columns = columns.map { |col| connection.quote_column_name(col) }

        CSV.open(export_path, 'wb') do |csv|
          csv << columns

          # Keyset pagination (id > last_id) instead of LIMIT/OFFSET:
          # OFFSET re-scans and discards all skipped rows on every batch,
          # making the export O(n^2) over large duplicate sets.
          batch_size = 1000
          last_id = 0
          exported = 0

          loop do
            records = connection.select_all(<<~SQL)
              SELECT #{quoted_columns.join(', ')}
              FROM points
              WHERE id IN (SELECT id FROM duplicate_points)
                AND id > #{last_id.to_i}
              ORDER BY id
              LIMIT #{batch_size};
            SQL
            break if records.empty?

            records.each do |record|
              csv << columns.map { |col| record[col] }
              last_id = record['id']
            end

            exported += records.length
            # One progress dot per ~10k rows exported.
            print '.' if (exported % 10_000).zero?
          end
        end

        puts "\nSuccessfully exported #{duplicate_count} duplicate points to #{export_path}"

        # Delete only after the export completed, so the CSV is a full backup.
        deleted_count = connection.execute(<<~SQL)
          DELETE FROM points
          WHERE id IN (SELECT id FROM duplicate_points);
        SQL
        puts "Successfully deleted #{deleted_count.cmd_tuples} duplicate points"
      else
        puts 'No duplicate points to remove'
      end
    ensure
      # Drop the session temp tables even when no duplicates were found or
      # an earlier step raised; the original only cleaned up on success.
      %w[lat_long_duplicates lonlat_duplicates duplicate_points].each do |table|
        connection.execute("DROP TABLE IF EXISTS #{table};")
      end
    end
  end
end