Skip to content

Commit 4ba7451

Browse files
authored
FIX: Make sure limits are updated and applied on each step (#1002)
1 parent 50f6192 commit 4ba7451

File tree

1 file changed

+22
-23
lines changed

1 file changed

+22
-23
lines changed

app/jobs/scheduled/embeddings_backfill.rb

Lines changed: 22 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -31,15 +31,14 @@ def execute(args)
3131
.where(archetype: Archetype.default)
3232
.where(deleted_at: nil)
3333
.order("topics.bumped_at DESC")
34-
.limit(limit - rebaked)
3534

36-
rebaked += populate_topic_embeddings(vector_rep, topics)
35+
rebaked += populate_topic_embeddings(vector_rep, topics.limit(limit - rebaked))
3736

3837
return if rebaked >= limit
3938

4039
# Then, we'll try to backfill embeddings for topics that have outdated
4140
# embeddings, be it model or strategy version
42-
relation = topics.where(<<~SQL)
41+
relation = topics.where(<<~SQL).limit(limit - rebaked)
4342
#{table_name}.model_version < #{vector_rep.version}
4443
OR
4544
#{table_name}.strategy_version < #{strategy.version}
@@ -65,20 +64,22 @@ def execute(args)
6564

6665
# Now for posts
6766
table_name = vector_rep.post_table_name
67+
posts_batch_size = 1000
6868

6969
posts =
7070
Post
7171
.joins("LEFT JOIN #{table_name} ON #{table_name}.post_id = posts.id")
7272
.where(deleted_at: nil)
7373
.where(post_type: Post.types[:regular])
74-
.limit(limit - rebaked)
7574

7675
# First, we'll try to backfill embeddings for posts that have none
7776
posts
7877
.where("#{table_name}.post_id IS NULL")
79-
.find_in_batches do |batch|
80-
vector_rep.gen_bulk_reprensentations(batch)
81-
rebaked += batch.size
78+
.limit(limit - rebaked)
79+
.pluck(:id)
80+
.each_slice(posts_batch_size) do |batch|
81+
vector_rep.gen_bulk_reprensentations(Post.where(id: batch))
82+
rebaked += batch.length
8283
end
8384

8485
return if rebaked >= limit
@@ -91,28 +92,26 @@ def execute(args)
9192
OR
9293
#{table_name}.strategy_version < #{strategy.version}
9394
SQL
94-
.find_in_batches do |batch|
95-
vector_rep.gen_bulk_reprensentations(batch)
96-
rebaked += batch.size
95+
.limit(limit - rebaked)
96+
.pluck(:id)
97+
.each_slice(posts_batch_size) do |batch|
98+
vector_rep.gen_bulk_reprensentations(Post.where(id: batch))
99+
rebaked += batch.length
97100
end
98101

99102
return if rebaked >= limit
100103

101104
# Finally, we'll try to backfill embeddings for posts that have outdated
102105
# embeddings due to edits. Here we only do 10% of the limit
103-
posts_batch_size = 1000
104-
105-
outdated_post_ids =
106-
posts
107-
.where("#{table_name}.updated_at < ?", 7.days.ago)
108-
.order("random()")
109-
.limit((limit - rebaked) / 10)
110-
.pluck(:id)
111-
112-
outdated_post_ids.each_slice(posts_batch_size) do |batch|
113-
vector_rep.gen_bulk_reprensentations(Post.where(id: batch))
114-
rebaked += batch.length
115-
end
106+
posts
107+
.where("#{table_name}.updated_at < ?", 7.days.ago)
108+
.order("random()")
109+
.limit((limit - rebaked) / 10)
110+
.pluck(:id)
111+
.each_slice(posts_batch_size) do |batch|
112+
vector_rep.gen_bulk_reprensentations(Post.where(id: batch))
113+
rebaked += batch.length
114+
end
116115

117116
rebaked
118117
end

0 commit comments

Comments
 (0)