diff --git a/.app_version b/.app_version
index 0f721773..e8262eb5 100644
--- a/.app_version
+++ b/.app_version
@@ -1 +1 @@
-0.30.2
+0.30.3
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e7c5a882..a51f35d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
+# [0.30.3] - 2025-07-23
+
+## Changed
+
+- Track generation is now significantly faster and less resource-intensive.
+
+
 # [0.30.2] - 2025-07-22
 
 ## Fixed
diff --git a/db/migrate/20250723164055_add_track_generation_composite_index.rb b/db/migrate/20250723164055_add_track_generation_composite_index.rb
new file mode 100644
index 00000000..1685011a
--- /dev/null
+++ b/db/migrate/20250723164055_add_track_generation_composite_index.rb
@@ -0,0 +1,9 @@
+class AddTrackGenerationCompositeIndex < ActiveRecord::Migration[8.0]
+  disable_ddl_transaction!
+
+  def change
+    add_index :points, [:user_id, :timestamp, :track_id],
+              algorithm: :concurrently,
+              name: 'idx_points_track_generation', if_not_exists: true
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index 402729b9..903fe090 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -10,7 +10,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema[8.0].define(version: 2025_07_21_204404) do
+ActiveRecord::Schema[8.0].define(version: 2025_07_23_164055) do
   # These are extensions that must be enabled in order to support this database
   enable_extension "pg_catalog.plpgsql"
   enable_extension "postgis"
@@ -146,7 +146,6 @@ ActiveRecord::Schema[8.0].define(version: 2025_07_21_204404) do
     t.datetime "created_at", null: false
     t.datetime "updated_at", null: false
     t.geography "lonlat", limit: {srid: 4326, type: "st_point", geographic: true}
-    t.index "(((geodata -> 'properties'::text) ->> 'osm_id'::text))", name: "index_places_on_geodata_osm_id"
     t.index ["lonlat"], name: "index_places_on_lonlat", using: :gist
   end
 
diff --git a/lib/optimized_tracks_v1.rb b/lib/optimized_tracks_v1.rb
deleted file mode 100644
index 08969aa4..00000000
--- a/lib/optimized_tracks_v1.rb
+++ /dev/null
@@ -1,145 +0,0 @@
-# frozen_string_literal: true
-
-# Optimization V1: LAG-based distance calculation with Ruby segmentation
-# This keeps the existing Ruby segmentation logic but uses PostgreSQL LAG
-# for batch distance calculations instead of individual queries
-
-module OptimizedTracksV1
-  extend ActiveSupport::Concern
-
-  module ClassMethods
-    # V1: Use LAG to get all consecutive distances in a single query
-    def calculate_all_consecutive_distances(points)
-      return [] if points.length < 2
-
-      point_ids = points.map(&:id).join(',')
-
-      results = connection.execute(<<-SQL.squish)
-        WITH points_with_previous AS (
-          SELECT
-            id,
-            timestamp,
-            lonlat,
-            LAG(lonlat) OVER (ORDER BY timestamp) as prev_lonlat,
-            LAG(timestamp) OVER (ORDER BY timestamp) as prev_timestamp,
-            LAG(id) OVER (ORDER BY timestamp) as prev_id
-          FROM points
-          WHERE id IN (#{point_ids})
-        )
-        SELECT
-          id,
-          prev_id,
-          timestamp,
-          prev_timestamp,
-          ST_Distance(lonlat::geography, prev_lonlat::geography) as distance_meters,
-          (timestamp - prev_timestamp) as time_diff_seconds
-        FROM points_with_previous
-        WHERE prev_lonlat IS NOT NULL
-        ORDER BY timestamp
-      SQL
-
-      # Return hash mapping point_id => {distance_to_previous, time_diff}
-      distance_map = {}
-      results.each do |row|
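        # (connection.execute returns PG::Result rows as string-valued hashes,
        # hence the to_i/to_f coercion in the block below)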
distance_map[row['id'].to_i] = { - distance_meters: row['distance_meters'].to_f, - time_diff_seconds: row['time_diff_seconds'].to_i, - prev_id: row['prev_id'].to_i - } - end - - distance_map - end - - # V1: Optimized total distance using LAG (already exists in distanceable.rb) - def total_distance_lag(points, unit = :m) - unless ::DISTANCE_UNITS.key?(unit.to_sym) - raise ArgumentError, "Invalid unit. Supported units are: #{::DISTANCE_UNITS.keys.join(', ')}" - end - - return 0 if points.length < 2 - - point_ids = points.map(&:id).join(',') - - distance_in_meters = connection.select_value(<<-SQL.squish) - WITH points_with_previous AS ( - SELECT - lonlat, - LAG(lonlat) OVER (ORDER BY timestamp) as prev_lonlat - FROM points - WHERE id IN (#{point_ids}) - ) - SELECT COALESCE( - SUM(ST_Distance(lonlat::geography, prev_lonlat::geography)), - 0 - ) - FROM points_with_previous - WHERE prev_lonlat IS NOT NULL - SQL - - distance_in_meters.to_f / ::DISTANCE_UNITS[unit.to_sym] - end - end -end - -# Optimized segmentation module using pre-calculated distances -module OptimizedSegmentationV1 - extend ActiveSupport::Concern - - private - - def split_points_into_segments_v1(points) - return [] if points.empty? - - # V1: Pre-calculate all distances and time diffs in one query - if points.size > 1 - distance_data = Point.calculate_all_consecutive_distances(points) - else - distance_data = {} - end - - segments = [] - current_segment = [] - - points.each do |point| - if current_segment.empty? - # First point always starts a segment - current_segment = [point] - elsif should_start_new_segment_v1?(point, current_segment.last, distance_data) - # Finalize current segment if it has enough points - segments << current_segment if current_segment.size >= 2 - current_segment = [point] - else - current_segment << point - end - end - - # Don't forget the last segment - segments << current_segment if current_segment.size >= 2 - - segments - end - - def should_start_new_segment_v1?(current_point, previous_point, distance_data) - return false if previous_point.nil? 
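      # (time_threshold_minutes / distance_threshold_meters are expected to be
      # supplied by the including class, e.g. derived from user.safe_settings)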
- - # Get pre-calculated data for this point - point_data = distance_data[current_point.id] - return false unless point_data - - # Check time threshold - time_threshold_seconds = time_threshold_minutes.to_i * 60 - return true if point_data[:time_diff_seconds] > time_threshold_seconds - - # Check distance threshold - distance_meters = point_data[:distance_meters] - return true if distance_meters > distance_threshold_meters - - false - end -end - -# Add methods to Point class -class Point - extend OptimizedTracksV1::ClassMethods -end \ No newline at end of file diff --git a/lib/optimized_tracks_v2.rb b/lib/optimized_tracks_v2.rb deleted file mode 100644 index 03981fe6..00000000 --- a/lib/optimized_tracks_v2.rb +++ /dev/null @@ -1,291 +0,0 @@ -# frozen_string_literal: true - -# Optimization V2: Full SQL segmentation using PostgreSQL window functions -# This does both distance calculation AND segmentation entirely in SQL - -module OptimizedTracksV2 - extend ActiveSupport::Concern - - module ClassMethods - # V2: Complete segmentation in SQL using LAG and window functions - def segment_points_in_sql(user_id, start_timestamp, end_timestamp, time_threshold_minutes, distance_threshold_meters) - time_threshold_seconds = time_threshold_minutes * 60 - - sql = <<~SQL - WITH points_with_gaps AS ( - SELECT - id, - timestamp, - lonlat, - LAG(lonlat) OVER (ORDER BY timestamp) as prev_lonlat, - LAG(timestamp) OVER (ORDER BY timestamp) as prev_timestamp, - ST_Distance( - lonlat::geography, - LAG(lonlat) OVER (ORDER BY timestamp)::geography - ) as distance_meters, - (timestamp - LAG(timestamp) OVER (ORDER BY timestamp)) as time_diff_seconds - FROM points - WHERE user_id = $1 - AND timestamp BETWEEN $2 AND $3 - ORDER BY timestamp - ), - segment_breaks AS ( - SELECT *, - CASE - WHEN prev_lonlat IS NULL THEN 1 - WHEN time_diff_seconds > $4 THEN 1 - WHEN distance_meters > $5 THEN 1 - ELSE 0 - END as is_break - FROM points_with_gaps - ), - segments AS ( - SELECT *, - SUM(is_break) OVER (ORDER BY timestamp ROWS UNBOUNDED PRECEDING) as segment_id - FROM segment_breaks - ) - SELECT - segment_id, - array_agg(id ORDER BY timestamp) as point_ids, - count(*) as point_count, - min(timestamp) as start_timestamp, - max(timestamp) as end_timestamp, - sum(COALESCE(distance_meters, 0)) as total_distance_meters - FROM segments - GROUP BY segment_id - HAVING count(*) >= 2 - ORDER BY segment_id - SQL - - results = connection.exec_query( - sql, - 'segment_points_in_sql', - [user_id, start_timestamp, end_timestamp, time_threshold_seconds, distance_threshold_meters] - ) - - # Convert results to segment data - segments_data = [] - results.each do |row| - segments_data << { - segment_id: row['segment_id'].to_i, - point_ids: parse_postgres_array(row['point_ids']), - point_count: row['point_count'].to_i, - start_timestamp: row['start_timestamp'].to_i, - end_timestamp: row['end_timestamp'].to_i, - total_distance_meters: row['total_distance_meters'].to_f - } - end - - segments_data - end - - # V2: Get actual Point objects for each segment - def get_segments_with_points(user_id, start_timestamp, end_timestamp, time_threshold_minutes, distance_threshold_meters) - segments_data = segment_points_in_sql(user_id, start_timestamp, end_timestamp, time_threshold_minutes, distance_threshold_meters) - - # Get all point IDs we need - all_point_ids = segments_data.flat_map { |seg| seg[:point_ids] } - - # Single query to get all points - points_by_id = Point.where(id: all_point_ids).index_by(&:id) - - # Build segments with actual Point objects - 
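      # (point_ids arrive ordered by timestamp via array_agg(... ORDER BY timestamp),
      # and the .compact below drops any ids whose rows vanished between the two queries)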
segments_data.map do |seg_data| - { - points: seg_data[:point_ids].map { |id| points_by_id[id] }.compact, - pre_calculated_distance: seg_data[:total_distance_meters], - start_timestamp: seg_data[:start_timestamp], - end_timestamp: seg_data[:end_timestamp] - } - end - end - - private - - # Parse PostgreSQL array format like "{1,2,3}" into Ruby array - def parse_postgres_array(pg_array_string) - return [] if pg_array_string.nil? || pg_array_string.empty? - - # Remove curly braces and split by comma - pg_array_string.gsub(/[{}]/, '').split(',').map(&:to_i) - end - end -end - -# Optimized generator using V2 SQL segmentation -class OptimizedTracksGeneratorV2 - attr_reader :user, :start_at, :end_at, :mode - - def initialize(user, start_at: nil, end_at: nil, mode: :bulk) - @user = user - @start_at = start_at - @end_at = end_at - @mode = mode.to_sym - end - - def call - clean_existing_tracks if should_clean_tracks? - - # Get timestamp range for SQL query - start_timestamp, end_timestamp = get_timestamp_range - - Rails.logger.debug "OptimizedGeneratorV2: querying points for user #{user.id} in #{mode} mode" - - # V2: Get segments directly from SQL with pre-calculated distances - segments = Point.get_segments_with_points( - user.id, - start_timestamp, - end_timestamp, - time_threshold_minutes, - distance_threshold_meters - ) - - Rails.logger.debug "OptimizedGeneratorV2: created #{segments.size} segments via SQL" - - tracks_created = 0 - - segments.each do |segment_data| - track = create_track_from_segment_v2(segment_data) - tracks_created += 1 if track - end - - Rails.logger.info "Generated #{tracks_created} tracks for user #{user.id} in optimized V2 #{mode} mode" - tracks_created - end - - private - - def create_track_from_segment_v2(segment_data) - points = segment_data[:points] - pre_calculated_distance = segment_data[:pre_calculated_distance] - - Rails.logger.debug "OptimizedGeneratorV2: processing segment with #{points.size} points" - return unless points.size >= 2 - - track = Track.new( - user_id: user.id, - start_at: Time.zone.at(points.first.timestamp), - end_at: Time.zone.at(points.last.timestamp), - original_path: build_path(points) - ) - - # V2: Use pre-calculated distance from SQL - track.distance = pre_calculated_distance.round - track.duration = calculate_duration(points) - track.avg_speed = calculate_average_speed(track.distance, track.duration) - - # Calculate elevation statistics (no DB queries needed) - elevation_stats = calculate_elevation_stats(points) - track.elevation_gain = elevation_stats[:gain] - track.elevation_loss = elevation_stats[:loss] - track.elevation_max = elevation_stats[:max] - track.elevation_min = elevation_stats[:min] - - if track.save - Point.where(id: points.map(&:id)).update_all(track_id: track.id) - track - else - Rails.logger.error "Failed to create track for user #{user.id}: #{track.errors.full_messages.join(', ')}" - nil - end - end - - def get_timestamp_range - case mode - when :bulk - if start_at && end_at - [start_at.to_i, end_at.to_i] - else - # Get full range for user - first_point = user.tracked_points.order(:timestamp).first - last_point = user.tracked_points.order(:timestamp).last - [first_point&.timestamp || 0, last_point&.timestamp || Time.current.to_i] - end - when :daily - day = start_at&.to_date || Date.current - [day.beginning_of_day.to_i, day.end_of_day.to_i] - when :incremental - # For incremental, we need all untracked points up to end_at - first_point = user.tracked_points.where(track_id: nil).order(:timestamp).first - end_timestamp = 
end_at ? end_at.to_i : Time.current.to_i
-      [first_point&.timestamp || 0, end_timestamp]
-    end
-  end
-
-  def should_clean_tracks?
-    case mode
-    when :bulk, :daily then true
-    else false
-    end
-  end
-
-  def clean_existing_tracks
-    case mode
-    when :bulk
-      scope = user.tracks
-      if start_at && end_at
-        scope = scope.where(start_at: start_at..end_at)
-      end
-      scope.destroy_all
-    when :daily
-      day = start_at&.to_date || Date.current
-      range = day.beginning_of_day..day.end_of_day
-      user.tracks.where(start_at: range).destroy_all
-    end
-  end
-
-  # Helper methods (same as original)
-  def build_path(points)
-    Tracks::BuildPath.new(points).call
-  end
-
-  def calculate_duration(points)
-    points.last.timestamp - points.first.timestamp
-  end
-
-  def calculate_average_speed(distance_in_meters, duration_seconds)
-    return 0.0 if duration_seconds <= 0 || distance_in_meters <= 0
-
-    speed_mps = distance_in_meters.to_f / duration_seconds
-    (speed_mps * 3.6).round(2) # m/s to km/h
-  end
-
-  def calculate_elevation_stats(points)
-    altitudes = points.map(&:altitude).compact
-    return { gain: 0, loss: 0, max: 0, min: 0 } if altitudes.empty?
-
-    elevation_gain = 0
-    elevation_loss = 0
-    previous_altitude = altitudes.first
-
-    altitudes[1..].each do |altitude|
-      diff = altitude - previous_altitude
-      if diff > 0
-        elevation_gain += diff
-      else
-        elevation_loss += diff.abs
-      end
-      previous_altitude = altitude
-    end
-
-    {
-      gain: elevation_gain.round,
-      loss: elevation_loss.round,
-      max: altitudes.max,
-      min: altitudes.min
-    }
-  end
-
-  def distance_threshold_meters
-    @distance_threshold_meters ||= user.safe_settings.meters_between_routes.to_i
-  end
-
-  def time_threshold_minutes
-    @time_threshold_minutes ||= user.safe_settings.minutes_between_routes.to_i
-  end
-end
-
-# Add methods to Point class
-class Point
-  extend OptimizedTracksV2::ClassMethods
-end
\ No newline at end of file
diff --git a/lib/results.md b/lib/results.md
deleted file mode 100644
index b9d3bcc4..00000000
--- a/lib/results.md
+++ /dev/null
@@ -1,122 +0,0 @@
-## Original
-
-Generator: created track 227296
-Generated 1437 tracks for user 1 in bulk mode
-✅ Generation completed successfully
-
-============================================================
-📊 BENCHMARK RESULTS
-============================================================
-Status: ✅ SUCCESS
-Execution Time: 1m 28.5s
-Tracks Created: 1437
-Timeframe Coverage: 8.0% of user's total data
-
-💾 Memory Usage:
-  Start: 210.9MB
-  End: 433.2MB
-  Memory Increase: +222.3MB
-
-🗄️ Database Performance:
-  Total Queries: 115920
-  Total Query Time: 50453.1ms
-  Average Query Time: 0.44ms
-  Slow Queries (>100ms): 63
-    1. 983.24ms - SELECT COUNT(*) FROM "points" WHERE "points"."user_id" = $1 AND "points"."timestamp" BETWEEN $2 A...
-    2. 2826.02ms - SELECT "points".* FROM "points" WHERE "points"."user_id" = $1 AND "points"."timestamp" BETWEEN $2...
-    3. 217.02ms - UPDATE "points" SET "track_id" = $1 WHERE "points"."id" IN ($2, $3, $4, $5, $6, $7, $8, $9, $10, ...
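The two slowest statements above filter on `user_id` and `timestamp` — exactly the access pattern the composite index shipped in this release (`idx_points_track_generation`) is built for. A minimal console sketch for checking the plan; the expected plan line is an assumption and depends on the planner and table statistics:

```ruby
# Hedged sketch: confirm the hot benchmark query can use the new index.
range = Time.utc(2024, 1, 1).to_i..Time.utc(2024, 12, 31).to_i
puts Point.where(user_id: 1, timestamp: range).explain
# Hoped-for plan fragment (not guaranteed):
#   Index Scan using idx_points_track_generation on points ...
```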
-
-✔️ Post-Generation Validation:
-  Points in Timeframe: 111609
-  Points with Tracks: 110167
-  Points without Tracks: 1442
-  Track Records: 1437
-  ✅ Data integrity: PASS
-
-🔍 Performance Analysis:
-  Speed Rating: 🚀 Excellent (1m 28.5s)
-  Memory Rating: 🧡 High (433.2MB peak)
-  Recommendation: Consider database optimization or smaller batch sizes
-
-🔮 Extrapolation for Full Dataset:
-  Full Dataset Size: 1,403,662 points
-  Scaling Factor: 12.6x
-  Estimated Full Time: 18m 32.8s
-  Estimated Full Memory: 5447.6MB
-
-============================================================
-📋 BENCHMARK SUMMARY
-============================================================
-⏱️ Total Time: 1m 28.5s
-📍 Points Processed: 111,609
-🛤️ Tracks Created: 1437
-🚀 Processing Speed: 1261.4 points/second
-📅 Timeframe: 2024-01-01 to 2024-12-31
-👤 User: demo@dawarich.app (ID: 1)
-✅ Status: COMPLETED
-
-
-## Iteration 1
-
-Generator: created track 244784
-Generated 1435 tracks for user 1 in optimized bulk mode
-✅ Generation completed successfully
-
-============================================================
-📊 BENCHMARK RESULTS
-============================================================
-Status: ✅ SUCCESS
-Execution Time: 56.4s
-Tracks Created: 1435
-Points Processed: 111,609
-Processing Speed: 1978.3 points/second
-Average Points/Track: 77.8
-Timeframe Coverage: 8.0% of user's total data
-
-💾 Memory Usage:
-  Start: 297.2MB
-  End: 407.5MB
-  Memory Increase: +110.3MB
-
-🗄️ Database Performance:
-  Total Queries: 7178
-  Total Query Time: 44521.33ms
-  Average Query Time: 6.2ms
-  Slow Queries (>100ms): 88
-    1. 2338.43ms - WITH points_with_gaps AS (
-          SELECT
-            id,
-            timestamp,
-            lonlat,
-            LAG(lonlat) OVER (ORDE...
-    2. 4156.84ms - SELECT "points".* FROM "points" WHERE "points"."id" IN (2163775, 2163776, 2163777, 2163778, 21637...
-    3. 298.62ms - UPDATE "points" SET "track_id" = $1 WHERE "points"."id" IN ($2, $3, $4, $5, $6, $7, $8, $9, $10, ...
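The 4156.84ms query above materializes every segment's points through one huge `IN (...)` id list. One way to soften that, sketched here with a hypothetical helper and an arbitrary batch size — this is not code from this PR:

```ruby
# Hypothetical: load segment points in fixed-size id batches instead of a
# single giant IN list; each slice stays small enough to parse and plan cheaply.
def load_points_in_batches(point_ids, batch_size: 10_000)
  point_ids.each_slice(batch_size).flat_map do |ids|
    Point.where(id: ids).order(:timestamp).to_a
  end
end
```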
-
-✔️ Post-Generation Validation:
-  Points in Timeframe: 111609
-  Points with Tracks: 110123
-  Points without Tracks: 1486
-  Track Records: 1435
-  ✅ Data integrity: PASS
-
-🔍 Performance Analysis:
-  Speed Rating: 🚀 Excellent (56.4s)
-  Memory Rating: 🧡 High (407.5MB peak)
-  Recommendation: Consider database optimization or smaller batch sizes
-
-🔮 Extrapolation for Full Dataset:
-  Full Dataset Size: 1,403,662 points
-  Scaling Factor: 12.6x
-  Estimated Full Time: 11m 49.5s
-  Estimated Full Memory: 5125.0MB
-
-============================================================
-📋 BENCHMARK SUMMARY
-============================================================
-⏱️ Total Time: 56.4s
-📍 Points Processed: 111,609
-🛤️ Tracks Created: 1435
-🚀 Processing Speed: 1978.3 points/second
-📅 Timeframe: 2024-01-01 to 2024-12-31
-👤 User: demo@dawarich.app (ID: 1)
-✅ Status: COMPLETED
diff --git a/lib/tracks_optimization_benchmark.rb b/lib/tracks_optimization_benchmark.rb
deleted file mode 100644
index bac160d5..00000000
--- a/lib/tracks_optimization_benchmark.rb
+++ /dev/null
@@ -1,625 +0,0 @@
-# frozen_string_literal: true
-
-require_relative 'optimized_tracks_v1'
-require_relative 'optimized_tracks_v2'
-
-# Benchmark script to compare three different track generation approaches:
-# - Original: Individual distance queries (current implementation)
-# - V1: LAG-based distance pre-calculation with Ruby segmentation
-# - V2: Full SQL segmentation with PostgreSQL window functions
-#
-# Usage:
-#   rails runner lib/tracks_optimization_benchmark.rb USER_ID START_DATE END_DATE
-
-class TracksOptimizationBenchmark
-  attr_reader :user, :start_date, :end_date, :start_timestamp, :end_timestamp
-
-  def initialize(user_id, start_date, end_date)
-    @user = User.find(user_id)
-    @start_date = Date.parse(start_date)
-    @end_date = Date.parse(end_date)
-    @start_timestamp = @start_date.beginning_of_day.to_i
-    @end_timestamp = @end_date.end_of_day.to_i
-
-    puts "🔬 Track Generation Optimization Benchmark"
-    puts "👤 User: #{user.email} (ID: #{user.id})"
-    puts "📅 Timeframe: #{start_date} to #{end_date}"
-
-    check_data_availability
-  end
-
-  def run_all_benchmarks
-    results = {}
-
-    puts "\n" + "=" * 80
-    puts "🏃 RUNNING ALL BENCHMARKS"
-    puts "=" * 80
-
-    # Test Original approach
-    puts "\n1️⃣ Testing ORIGINAL approach..."
-    results[:original] = benchmark_original
-
-    # Test V1 approach
-    puts "\n2️⃣ Testing V1 (LAG + Ruby) approach..."
-    results[:v1] = benchmark_v1
-
-    # Test V2 approach
-    puts "\n3️⃣ Testing V2 (Full SQL) approach..."
-    results[:v2] = benchmark_v2
-
-    # Compare results
-    puts "\n" + "=" * 80
-    puts "📊 PERFORMANCE COMPARISON"
-    puts "=" * 80
-    compare_results(results)
-
-    # Save results to files
-    save_results_to_files(results)
-
-    results
-  end
-
-  private
-
-  def check_data_availability
-    point_count = user.tracked_points.where(timestamp: start_timestamp..end_timestamp).count
-    existing_tracks = user.tracks.where(start_at: Time.zone.at(start_timestamp)..Time.zone.at(end_timestamp)).count
-
-    puts "📊 Dataset: #{point_count.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse} points"
-    puts "🛤️ Existing tracks: #{existing_tracks}"
-
-    if point_count == 0
-      puts "❌ No points found in timeframe"
-      exit 1
-    end
-
-    if point_count > 50000
-      puts "⚠️ Large dataset detected. This benchmark may take a while..."
-    end
-  end
-
-  def benchmark_original
-    puts "   Using standard Tracks::Generator..."
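    # Measurement protocol (shared by all three benchmarks below): wall-clock
    # time, RSS delta via `ps`, and query count/duration via an
    # ActiveSupport::Notifications subscription to 'sql.active_record'.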
- - # Clean existing tracks - cleanup_tracks - - # Monitor performance - memory_start = get_memory_mb - query_monitor = QueryMonitor.new - query_monitor.start - - start_time = Time.current - - begin - generator = Tracks::Generator.new( - user, - start_at: Time.zone.at(start_timestamp), - end_at: Time.zone.at(end_timestamp), - mode: :bulk - ) - tracks_created = generator.call - success = true - rescue => e - success = false - error = e.message - tracks_created = 0 - end - - end_time = Time.current - memory_end = get_memory_mb - query_monitor.stop - - execution_time = end_time - start_time - - result = { - approach: "Original", - success: success, - error: error, - execution_time: execution_time, - tracks_created: tracks_created, - memory_increase: memory_end - memory_start, - query_count: query_monitor.query_count, - query_time_ms: query_monitor.total_time_ms - } - - print_result(result) - result - end - - def benchmark_v1 - puts " Using V1: LAG + Ruby segmentation..." - - # Clean existing tracks - cleanup_tracks - - # For V1, we need to modify the existing generator to use our optimized methods - # This is a simplified test - in practice we'd modify the actual generator - - memory_start = get_memory_mb - query_monitor = QueryMonitor.new - query_monitor.start - - start_time = Time.current - - begin - # Load points - points = user.tracked_points - .where(timestamp: start_timestamp..end_timestamp) - .order(:timestamp) - - # V1: Use optimized segmentation with pre-calculated distances - if points.size > 1 - distance_data = Point.calculate_all_consecutive_distances(points) - else - distance_data = {} - end - - # Segment using V1 approach (simplified for benchmark) - segments = split_points_with_precalculated_distances(points, distance_data) - - tracks_created = 0 - segments.each do |segment| - if segment.size >= 2 - track = create_track_v1(segment) - tracks_created += 1 if track - end - end - - success = true - rescue => e - success = false - error = e.message - tracks_created = 0 - end - - end_time = Time.current - memory_end = get_memory_mb - query_monitor.stop - - execution_time = end_time - start_time - - result = { - approach: "V1 (LAG + Ruby)", - success: success, - error: error, - execution_time: execution_time, - tracks_created: tracks_created, - memory_increase: memory_end - memory_start, - query_count: query_monitor.query_count, - query_time_ms: query_monitor.total_time_ms - } - - print_result(result) - result - end - - def benchmark_v2 - puts " Using V2: Full SQL segmentation..." - - cleanup_tracks - - memory_start = get_memory_mb - query_monitor = QueryMonitor.new - query_monitor.start - - start_time = Time.current - - begin - generator = OptimizedTracksGeneratorV2.new( - user, - start_at: Time.zone.at(start_timestamp), - end_at: Time.zone.at(end_timestamp), - mode: :bulk - ) - tracks_created = generator.call - success = true - rescue => e - success = false - error = e.message - tracks_created = 0 - end - - end_time = Time.current - memory_end = get_memory_mb - query_monitor.stop - - execution_time = end_time - start_time - - result = { - approach: "V2 (Full SQL)", - success: success, - error: error, - execution_time: execution_time, - tracks_created: tracks_created, - memory_increase: memory_end - memory_start, - query_count: query_monitor.query_count, - query_time_ms: query_monitor.total_time_ms - } - - print_result(result) - result - end - - def split_points_with_precalculated_distances(points, distance_data) - return [] if points.empty? 
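    # Single pass over the time-ordered points: each point either extends the
    # current segment or, when its pre-computed time/distance gap exceeds a
    # threshold, closes the segment (kept only with >= 2 points) and starts a new one.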
- - segments = [] - current_segment = [] - - points.each do |point| - if current_segment.empty? - current_segment = [point] - elsif should_break_segment_v1?(point, current_segment.last, distance_data) - segments << current_segment if current_segment.size >= 2 - current_segment = [point] - else - current_segment << point - end - end - - segments << current_segment if current_segment.size >= 2 - segments - end - - def should_break_segment_v1?(current_point, previous_point, distance_data) - return false if previous_point.nil? - - point_data = distance_data[current_point.id] - return false unless point_data - - time_threshold_seconds = user.safe_settings.minutes_between_routes.to_i * 60 - distance_threshold_meters = user.safe_settings.meters_between_routes.to_i - - return true if point_data[:time_diff_seconds] > time_threshold_seconds - return true if point_data[:distance_meters] > distance_threshold_meters - - false - end - - def create_track_v1(points) - return nil if points.size < 2 - - track = Track.new( - user_id: user.id, - start_at: Time.zone.at(points.first.timestamp), - end_at: Time.zone.at(points.last.timestamp), - original_path: build_path(points) - ) - - # Use LAG-based distance calculation - track.distance = Point.total_distance_lag(points, :m).round - track.duration = points.last.timestamp - points.first.timestamp - track.avg_speed = calculate_average_speed(track.distance, track.duration) - - # Elevation stats (same as original) - elevation_stats = calculate_elevation_stats(points) - track.elevation_gain = elevation_stats[:gain] - track.elevation_loss = elevation_stats[:loss] - track.elevation_max = elevation_stats[:max] - track.elevation_min = elevation_stats[:min] - - if track.save - Point.where(id: points.map(&:id)).update_all(track_id: track.id) - track - else - nil - end - end - - def cleanup_tracks - user.tracks.where(start_at: Time.zone.at(start_timestamp)..Time.zone.at(end_timestamp)).destroy_all - end - - def print_result(result) - status = result[:success] ? 
"โœ… SUCCESS" : "โŒ FAILED" - puts " #{status}" - puts " โฑ๏ธ Time: #{format_duration(result[:execution_time])}" - puts " ๐Ÿ›ค๏ธ Tracks: #{result[:tracks_created]}" - puts " ๐Ÿ’พ Memory: +#{result[:memory_increase].round(1)}MB" - puts " ๐Ÿ—„๏ธ Queries: #{result[:query_count]} (#{result[:query_time_ms].round(1)}ms)" - puts " โŒ Error: #{result[:error]}" if result[:error] - end - - def compare_results(results) - return unless results[:original] && results[:v1] && results[:v2] - - puts sprintf("%-20s %-10s %-12s %-10s %-15s %-10s", - "Approach", "Time", "Tracks", "Memory", "Queries", "Query Time") - puts "-" * 80 - - [:original, :v1, :v2].each do |approach| - result = results[approach] - next unless result[:success] - - puts sprintf("%-20s %-10s %-12s %-10s %-15s %-10s", - result[:approach], - format_duration(result[:execution_time]), - result[:tracks_created], - "+#{result[:memory_increase].round(1)}MB", - result[:query_count], - "#{result[:query_time_ms].round(1)}ms") - end - - # Calculate improvements - if results[:original][:success] - original_time = results[:original][:execution_time] - original_queries = results[:original][:query_count] - - puts "\n๐Ÿš€ Performance Improvements vs Original:" - - if results[:v1][:success] - v1_speedup = (original_time / results[:v1][:execution_time]).round(2) - v1_query_reduction = ((original_queries - results[:v1][:query_count]) / original_queries.to_f * 100).round(1) - puts " V1: #{v1_speedup}x faster, #{v1_query_reduction}% fewer queries" - end - - if results[:v2][:success] - v2_speedup = (original_time / results[:v2][:execution_time]).round(2) - v2_query_reduction = ((original_queries - results[:v2][:query_count]) / original_queries.to_f * 100).round(1) - puts " V2: #{v2_speedup}x faster, #{v2_query_reduction}% fewer queries" - end - end - end - - def save_results_to_files(results) - timestamp = Time.current.strftime('%Y%m%d_%H%M%S') - point_count = user.tracked_points.where(timestamp: start_timestamp..end_timestamp).count - - # Create detailed results structure - benchmark_data = { - meta: { - timestamp: Time.current.iso8601, - user_id: user.id, - user_email: user.email, - start_date: start_date.strftime('%Y-%m-%d'), - end_date: end_date.strftime('%Y-%m-%d'), - point_count: point_count, - ruby_version: RUBY_VERSION, - rails_version: Rails.version, - database_adapter: ActiveRecord::Base.connection.adapter_name - }, - results: results, - performance_analysis: analyze_performance_data(results) - } - - # Save JSON results for programmatic analysis - json_filename = "tracks_optimization_#{timestamp}.json" - json_path = Rails.root.join('lib', json_filename) - File.write(json_path, JSON.pretty_generate(benchmark_data)) - - # Save human-readable markdown report - md_filename = "tracks_optimization_#{timestamp}.md" - md_path = Rails.root.join('lib', md_filename) - File.write(md_path, generate_markdown_report(benchmark_data)) - - puts "\n๐Ÿ’พ Results saved:" - puts " ๐Ÿ“„ JSON: #{json_path}" - puts " ๐Ÿ“ Report: #{md_path}" - end - - def analyze_performance_data(results) - return {} unless results[:original] && results[:original][:success] - - original = results[:original] - analysis = { - baseline: { - execution_time: original[:execution_time], - query_count: original[:query_count], - memory_usage: original[:memory_increase] - } - } - - [:v1, :v2].each do |version| - next unless results[version] && results[version][:success] - - result = results[version] - analysis[version] = { - speedup_factor: (original[:execution_time] / 
result[:execution_time]).round(2), - query_reduction_percent: ((original[:query_count] - result[:query_count]) / original[:query_count].to_f * 100).round(1), - memory_change_percent: ((result[:memory_increase] - original[:memory_increase]) / original[:memory_increase].to_f * 100).round(1), - execution_time_saved: (original[:execution_time] - result[:execution_time]).round(2) - } - end - - analysis - end - - def generate_markdown_report(benchmark_data) - meta = benchmark_data[:meta] - results = benchmark_data[:results] - analysis = benchmark_data[:performance_analysis] - - report = <<~MD - # Tracks Generation Optimization Benchmark Report - - **Generated:** #{meta[:timestamp]} - **User:** #{meta[:user_email]} (ID: #{meta[:user_id]}) - **Timeframe:** #{meta[:start_date]} to #{meta[:end_date]} - **Dataset:** #{meta[:point_count].to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse} points - **Environment:** Ruby #{meta[:ruby_version]}, Rails #{meta[:rails_version]}, #{meta[:database_adapter]} - - ## Summary - - This benchmark compares three approaches to track generation: - - **Original:** Individual PostGIS queries for each distance calculation - - **V1 (LAG + Ruby):** PostgreSQL LAG for batch distance calculation, Ruby segmentation - - **V2 (Full SQL):** Complete segmentation using PostgreSQL window functions - - ## Results - - | Approach | Status | Time | Tracks | Memory | Queries | Query Time | - |----------|--------|------|--------|--------|---------|------------| - MD - - [:original, :v1, :v2].each do |approach| - next unless results[approach] - - result = results[approach] - status = result[:success] ? "โœ…" : "โŒ" - - report += "| #{result[:approach]} | #{status} | #{format_duration(result[:execution_time])} | #{result[:tracks_created]} | +#{result[:memory_increase].round(1)}MB | #{result[:query_count]} | #{result[:query_time_ms].round(1)}ms |\n" - end - - if analysis[:v1] || analysis[:v2] - report += "\n## Performance Improvements\n\n" - - if analysis[:v1] - v1 = analysis[:v1] - report += "### V1 (LAG + Ruby) vs Original\n" - report += "- **#{v1[:speedup_factor]}x faster** execution\n" - report += "- **#{v1[:query_reduction_percent]}% fewer** database queries\n" - report += "- **#{format_duration(v1[:execution_time_saved])} time saved**\n" - report += "- Memory change: #{v1[:memory_change_percent] > 0 ? '+' : ''}#{v1[:memory_change_percent]}%\n\n" - end - - if analysis[:v2] - v2 = analysis[:v2] - report += "### V2 (Full SQL) vs Original\n" - report += "- **#{v2[:speedup_factor]}x faster** execution\n" - report += "- **#{v2[:query_reduction_percent]}% fewer** database queries\n" - report += "- **#{format_duration(v2[:execution_time_saved])} time saved**\n" - report += "- Memory change: #{v2[:memory_change_percent] > 0 ? 
'+' : ''}#{v2[:memory_change_percent]}%\n\n" - end - end - - # Add detailed results - report += "## Detailed Results\n\n" - - [:original, :v1, :v2].each do |approach| - next unless results[approach] - - result = results[approach] - report += "### #{result[:approach]}\n\n" - - if result[:success] - report += "- โœ… **Status:** Success\n" - report += "- โฑ๏ธ **Execution Time:** #{format_duration(result[:execution_time])}\n" - report += "- ๐Ÿ›ค๏ธ **Tracks Created:** #{result[:tracks_created]}\n" - report += "- ๐Ÿ’พ **Memory Increase:** +#{result[:memory_increase].round(1)}MB\n" - report += "- ๐Ÿ—„๏ธ **Database Queries:** #{result[:query_count]}\n" - report += "- โšก **Query Time:** #{result[:query_time_ms].round(1)}ms\n" - - if result[:query_count] > 0 - avg_query_time = (result[:query_time_ms] / result[:query_count]).round(2) - report += "- ๐Ÿ“Š **Average Query Time:** #{avg_query_time}ms\n" - end - else - report += "- โŒ **Status:** Failed\n" - report += "- ๐Ÿšจ **Error:** #{result[:error]}\n" - end - - report += "\n" - end - - report += "## Recommendations\n\n" - - if analysis[:v2] && analysis[:v2][:speedup_factor] > analysis.dig(:v1, :speedup_factor).to_f - report += "๐Ÿš€ **V2 (Full SQL)** shows the best performance with #{analysis[:v2][:speedup_factor]}x speedup.\n\n" - report += "Benefits:\n" - report += "- Minimal database queries (#{results.dig(:v2, :query_count)} vs #{results.dig(:original, :query_count)})\n" - report += "- Fastest execution time\n" - report += "- Leverages PostgreSQL's optimized window functions\n\n" - elsif analysis[:v1] - report += "๐Ÿƒ **V1 (LAG + Ruby)** provides good performance improvements with #{analysis[:v1][:speedup_factor]}x speedup.\n\n" - end - - if results[:original] && results[:original][:query_count] > 50000 - report += "โš ๏ธ **Current implementation** makes excessive database queries (#{results[:original][:query_count]}) for this dataset size.\n\n" - end - - report += "---\n*Generated by TracksOptimizationBenchmark*" - - report - end - - # Helper methods - def get_memory_mb - `ps -o rss= -p #{Process.pid}`.to_i / 1024.0 - end - - def format_duration(seconds) - if seconds < 60 - "#{seconds.round(1)}s" - else - minutes = (seconds / 60).floor - remaining_seconds = (seconds % 60).round(1) - "#{minutes}m #{remaining_seconds}s" - end - end - - def build_path(points) - Tracks::BuildPath.new(points).call - end - - def calculate_average_speed(distance_in_meters, duration_seconds) - return 0.0 if duration_seconds <= 0 || distance_in_meters <= 0 - speed_mps = distance_in_meters.to_f / duration_seconds - (speed_mps * 3.6).round(2) - end - - def calculate_elevation_stats(points) - altitudes = points.map(&:altitude).compact - return { gain: 0, loss: 0, max: 0, min: 0 } if altitudes.empty? 
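    # Positive altitude deltas accumulate as gain, negative deltas (absolute
    # value) as loss; max/min fall out of the altitude list directly.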
- - elevation_gain = 0 - elevation_loss = 0 - previous_altitude = altitudes.first - - altitudes[1..].each do |altitude| - diff = altitude - previous_altitude - if diff > 0 - elevation_gain += diff - else - elevation_loss += diff.abs - end - previous_altitude = altitude - end - - { gain: elevation_gain.round, loss: elevation_loss.round, max: altitudes.max, min: altitudes.min } - end -end - -# Simple query monitor for this benchmark -class QueryMonitor - attr_reader :query_count, :total_time_ms - - def initialize - @query_count = 0 - @total_time_ms = 0 - end - - def start - @subscription = ActiveSupport::Notifications.subscribe('sql.active_record') do |*args| - event = ActiveSupport::Notifications::Event.new(*args) - next if event.payload[:name]&.include?('SCHEMA') - - @query_count += 1 - @total_time_ms += event.duration - end - end - - def stop - ActiveSupport::Notifications.unsubscribe(@subscription) if @subscription - end -end - -# Command line interface -if __FILE__ == $0 - if ARGV.length < 3 - puts "Usage: rails runner #{__FILE__} USER_ID START_DATE END_DATE" - puts "" - puts "Example:" - puts " rails runner #{__FILE__} 1 2024-01-01 2024-01-31" - exit 1 - end - - user_id = ARGV[0].to_i - start_date = ARGV[1] - end_date = ARGV[2] - - benchmark = TracksOptimizationBenchmark.new(user_id, start_date, end_date) - results = benchmark.run_all_benchmarks - - puts "\n๐ŸŽ‰ Benchmark completed! Check results above." -end \ No newline at end of file diff --git a/tracks_performance_optimization_options.md b/tracks_performance_optimization_options.md deleted file mode 100644 index 8f7c6601..00000000 --- a/tracks_performance_optimization_options.md +++ /dev/null @@ -1,235 +0,0 @@ -# Tracks Feature Performance Optimization Options - -## Current State Analysis - -### Performance Characteristics -- **Time Complexity:** O(n log n) where n = number of GPS points -- **Memory Usage:** Loads entire dataset into memory (~200-400 bytes per point) -- **Processing Mode:** Single-threaded, sequential segmentation -- **Database Load:** Multiple PostGIS distance calculations per point pair - -### Performance Estimates (Bulk Mode) -| Points | Processing Time | Memory Usage | Database Load | -|--------|----------------|--------------|---------------| -| 10K | 30-60 seconds | ~50 MB | Low | -| 100K | 5-15 minutes | ~200 MB | Medium | -| 1M+ | 30-90 minutes | 400+ MB | High | - -### Current Bottlenecks -1. **Memory constraints** - Loading all points at once -2. **PostGIS distance calculations** - Sequential, not optimized -3. **Single-threaded processing** - No parallelization -4. 
**No progress indication** - Users can't track long-running operations - ---- - -## Optimization Options - -### Option 1: Enhanced Time-Based Batching -**Complexity:** Low | **Impact:** High | **Risk:** Low - -#### Implementation -- Extend existing `:daily` mode with configurable batch sizes -- Add 1-point overlap between batches to maintain segmentation accuracy -- Implement batch-aware progress reporting - -#### Benefits -- **Memory reduction:** 90%+ reduction (from 400MB to ~40MB for 1M points) -- **Better UX:** Progress indication and cancellation support -- **Incremental processing:** Can resume interrupted operations -- **Lower DB pressure:** Smaller query result sets - -#### Changes Required -```ruby -# Enhanced generator with configurable batching -Tracks::Generator.new( - user, - mode: :batched, - batch_size: 24.hours, - enable_overlap: true -).call -``` - -#### Edge Cases to Handle -- Tracks spanning batch boundaries (solved with overlap) -- Midnight-crossing tracks in daily mode -- Deduplication of overlapping segments - ---- - -### Option 2: Spatial Indexing Optimization -**Complexity:** Medium | **Impact:** Medium | **Risk:** Low - -#### Implementation -- Replace individual PostGIS calls with batch distance calculations -- Implement spatial clustering for nearby points before segmentation -- Use PostGIS window functions for distance calculations - -#### Benefits -- **Faster distance calculations:** Batch operations vs individual queries -- **Reduced DB round-trips:** Single query for multiple distance calculations -- **Better index utilization:** Leverage existing spatial indexes - -#### Changes Required -```sql --- Batch distance calculation approach -WITH point_distances AS ( - SELECT - id, - timestamp, - ST_Distance( - lonlat::geography, - LAG(lonlat::geography) OVER (ORDER BY timestamp) - ) as distance_to_previous - FROM points - WHERE user_id = ? - ORDER BY timestamp -) -SELECT * FROM point_distances WHERE distance_to_previous > ? 
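-- LAG pairs each row with its predecessor in one window scan, so all
-- consecutive distances come back from a single query instead of one
-- ST_Distance round-trip per point pair.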
-``` - ---- - -### Option 3: Parallel Processing with Worker Pools -**Complexity:** High | **Impact:** High | **Risk:** Medium - -#### Implementation -- Split large datasets into non-overlapping time ranges -- Process multiple batches in parallel using Sidekiq workers -- Implement coordination mechanism for dependent segments - -#### Benefits -- **Faster processing:** Utilize multiple CPU cores -- **Scalable:** Performance scales with worker capacity -- **Background processing:** Non-blocking for users - -#### Challenges -- **Complex coordination:** Managing dependencies between batches -- **Resource competition:** Multiple workers accessing same user's data -- **Error handling:** Partial failure scenarios - -#### Architecture -```ruby -# Parallel processing coordinator -class Tracks::ParallelGenerator - def call - time_ranges = split_into_parallel_ranges - - time_ranges.map do |range| - Tracks::BatchProcessorJob.perform_later(user_id, range) - end - end -end -``` - ---- - -### Option 4: Incremental Algorithm Enhancement -**Complexity:** Medium | **Impact:** Medium | **Risk:** Medium - -#### Implementation -- Enhance existing `:incremental` mode with smarter buffering -- Implement sliding window approach for active track detection -- Add automatic track finalization based on time gaps - -#### Benefits -- **Real-time processing:** Process points as they arrive -- **Lower memory footprint:** Only active segments in memory -- **Better for live tracking:** Immediate track updates - -#### Current Limitations -- Existing incremental mode processes untracked points only -- No automatic track finalization -- Limited to single active track per user - ---- - -### Option 5: Database-Level Optimization -**Complexity:** Low-Medium | **Impact:** Medium | **Risk:** Low - -#### Implementation -- Add composite indexes for common query patterns -- Implement materialized views for expensive calculations -- Use database-level segmentation logic - -#### Benefits -- **Faster queries:** Better index utilization -- **Reduced Ruby processing:** Move logic to database -- **Consistent performance:** Database optimizations benefit all modes - -#### Proposed Indexes -```sql --- Optimized for bulk processing -CREATE INDEX CONCURRENTLY idx_points_user_timestamp_track -ON points(user_id, timestamp) WHERE track_id IS NULL; - --- Optimized for incremental processing -CREATE INDEX CONCURRENTLY idx_points_untracked_timestamp -ON points(timestamp) WHERE track_id IS NULL; -``` - ---- - -## Recommended Implementation Strategy - -### Phase 1: Quick Wins (Week 1-2) -1. **Implement Enhanced Time-Based Batching** (Option 1) - - Extend existing daily mode with overlap - - Add progress reporting - - Configurable batch sizes - -### Phase 2: Database Optimization (Week 3) -2. **Add Database-Level Optimizations** (Option 5) - - Create optimized indexes - - Implement batch distance calculations - -### Phase 3: Advanced Features (Week 4-6) -3. **Spatial Indexing Optimization** (Option 2) - - Replace individual distance calculations - - Implement spatial clustering - -### Phase 4: Future Enhancements -4. **Parallel Processing** (Option 3) - Consider for v2 -5. 
**Incremental Enhancement** (Option 4) - For real-time features - ---- - -## Risk Assessment - -### Low Risk -- **Time-based batching:** Builds on existing daily mode -- **Database indexes:** Standard optimization technique -- **Progress reporting:** UI enhancement only - -### Medium Risk -- **Spatial optimization:** Requires careful testing of distance calculations -- **Incremental enhancement:** Changes to existing algorithm logic - -### High Risk -- **Parallel processing:** Complex coordination, potential race conditions -- **Major algorithm changes:** Could introduce segmentation bugs - ---- - -## Success Metrics - -### Performance Targets -- **Memory usage:** < 100MB for datasets up to 1M points -- **Processing time:** < 10 minutes for 1M points -- **User experience:** Progress indication and cancellation - -### Monitoring Points -- Database query performance -- Memory consumption during processing -- User-reported processing times -- Track generation accuracy (no regression) - ---- - -## Next Steps - -1. **Choose initial approach** based on urgency and resources -2. **Create feature branch** for selected optimization -3. **Implement comprehensive testing** including edge cases -4. **Monitor performance** in staging environment -5. **Gradual rollout** with feature flags \ No newline at end of file
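As a postscript to Option 5: the proposed partial indexes translate directly into a Rails migration in the same shape as the `idx_points_track_generation` migration this release actually ships. A sketch only — the index definitions come from the proposal above, and nothing below is shipped code:

```ruby
# Sketch: Option 5's partial indexes as a migration, mirroring the style of
# AddTrackGenerationCompositeIndex. Class name is hypothetical.
class AddTrackGenerationPartialIndexes < ActiveRecord::Migration[8.0]
  disable_ddl_transaction!

  def change
    # Optimized for bulk processing of not-yet-tracked points
    add_index :points, [:user_id, :timestamp],
              where: 'track_id IS NULL',
              algorithm: :concurrently,
              name: 'idx_points_user_timestamp_track', if_not_exists: true

    # Optimized for incremental processing
    add_index :points, :timestamp,
              where: 'track_id IS NULL',
              algorithm: :concurrently,
              name: 'idx_points_untracked_timestamp', if_not_exists: true
  end
end
```

The `where: 'track_id IS NULL'` clauses keep both indexes small by covering only untracked points, which is what both bulk and incremental generation scan for.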