# dawarich/lib/tasks/data_cleanup.rake
# 2025-03-21 18:16:23 +01:00
# 105 lines, 3.2 KiB, Ruby
# frozen_string_literal: true
require 'csv'
namespace :data_cleanup do
  desc 'Remove duplicate points using raw SQL and export them to a file'
  # Finds duplicate rows in `points` (by lat/long or by lonlat geometry,
  # each combined with timestamp + user_id), exports them to a timestamped
  # CSV under tmp/, then deletes them. The first row of each duplicate
  # group (lowest id) is always kept.
  task remove_duplicate_points: :environment do
    timestamp = Time.current.strftime('%Y%m%d%H%M%S')
    export_path = Rails.root.join("tmp/duplicate_points_#{timestamp}.csv")
    connection = ActiveRecord::Base.connection

    puts 'Finding duplicates...'

    begin
      # One temp table per duplicate criterion. ROW_NUMBER keeps the lowest
      # id in each group (row_num = 1) and flags the rest as duplicates.
      connection.execute(<<~SQL)
        DROP TABLE IF EXISTS lat_long_duplicates;
        CREATE TEMPORARY TABLE lat_long_duplicates AS
        SELECT id
        FROM (
          SELECT id,
                 ROW_NUMBER() OVER (PARTITION BY latitude, longitude, timestamp, user_id ORDER BY id) AS row_num
          FROM points
        ) AS dups
        WHERE dups.row_num > 1;
      SQL

      connection.execute(<<~SQL)
        DROP TABLE IF EXISTS lonlat_duplicates;
        CREATE TEMPORARY TABLE lonlat_duplicates AS
        SELECT id
        FROM (
          SELECT id,
                 ROW_NUMBER() OVER (PARTITION BY lonlat, timestamp, user_id ORDER BY id) AS row_num
          FROM points
        ) AS dups
        WHERE dups.row_num > 1;
      SQL

      # UNION (not UNION ALL) so an id matching both criteria appears once.
      connection.execute(<<~SQL)
        DROP TABLE IF EXISTS duplicate_points;
        CREATE TEMPORARY TABLE duplicate_points AS
        SELECT id FROM lat_long_duplicates
        UNION
        SELECT id FROM lonlat_duplicates;
      SQL

      duplicate_count = connection.select_value('SELECT COUNT(*) FROM duplicate_points').to_i
      puts "Found #{duplicate_count} duplicate points"

      if duplicate_count.positive?
        puts "Exporting duplicates to #{export_path}..."

        columns = connection.select_values(
          "SELECT column_name FROM information_schema.columns WHERE table_name = 'points' ORDER BY ordinal_position"
        )
        # Quote column names: `timestamp` (and potentially others) are SQL
        # keywords and must not be interpolated bare into the SELECT list.
        quoted_columns = columns.map { |col| connection.quote_column_name(col) }

        CSV.open(export_path, 'wb') do |csv|
          csv << columns

          # Keyset pagination (id > last_id) instead of LIMIT/OFFSET:
          # OFFSET re-scans and discards all skipped rows on every batch,
          # making the export O(n^2) over large duplicate sets.
          batch_size = 1000
          last_id = 0
          exported = 0

          loop do
            records = connection.select_all(<<~SQL)
              SELECT #{quoted_columns.join(', ')}
              FROM points
              WHERE id IN (SELECT id FROM duplicate_points)
                AND id > #{last_id.to_i}
              ORDER BY id
              LIMIT #{batch_size};
            SQL
            break if records.empty?

            records.each do |record|
              csv << columns.map { |col| record[col] }
              last_id = record['id']
            end

            exported += records.length
            # One progress dot per ~10k rows exported.
            print '.' if (exported % 10_000).zero?
          end
        end

        puts "\nSuccessfully exported #{duplicate_count} duplicate points to #{export_path}"

        # Delete only after the export completed, so the CSV is a full backup.
        deleted_count = connection.execute(<<~SQL)
          DELETE FROM points
          WHERE id IN (SELECT id FROM duplicate_points);
        SQL
        puts "Successfully deleted #{deleted_count.cmd_tuples} duplicate points"
      else
        puts 'No duplicate points to remove'
      end
    ensure
      # Drop the session temp tables even when no duplicates were found or
      # an earlier step raised; the original only cleaned up on success.
      %w[lat_long_duplicates lonlat_duplicates duplicate_points].each do |table|
        connection.execute("DROP TABLE IF EXISTS #{table};")
      end
    end
  end
end