diff --git a/.claude/settings.local.json b/.claude/settings.local.json
new file mode 100644
index 000000000..650de3e30
--- /dev/null
+++ b/.claude/settings.local.json
@@ -0,0 +1,8 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(bundle exec rails g migration:*)"
+    ],
+    "deny": []
+  }
+}
\ No newline at end of file
diff --git a/app/jobs/regular/generate_inferred_concepts.rb b/app/jobs/regular/generate_inferred_concepts.rb
new file mode 100644
index 000000000..06b0a7500
--- /dev/null
+++ b/app/jobs/regular/generate_inferred_concepts.rb
@@ -0,0 +1,69 @@
+# frozen_string_literal: true
+
+module Jobs
+  class GenerateInferredConcepts < ::Jobs::Base
+    sidekiq_options queue: "low"
+
+    # Process items to generate new concepts
+    #
+    # @param args [Hash] Contains job arguments
+    # @option args [String] :item_type Required - Type of items to process ('topics' or 'posts')
+    # @option args [Array] :item_ids Required - List of item IDs to process
+    # @option args [Integer] :batch_size (100) Number of items to process in each batch
+    # @option args [Boolean] :match_only (false) Only match against existing concepts without generating new ones
+    def execute(args = {})
+      return if args[:item_ids].blank? || args[:item_type].blank?
+
+      if %w[topics posts].exclude?(args[:item_type])
+        Rails.logger.error("Invalid item_type for GenerateInferredConcepts: #{args[:item_type]}")
+        return
+      end
+
+      # Process items in smaller batches to avoid memory issues
+      batch_size = args[:batch_size] || 100
+
+      # Get the list of item IDs
+      item_ids = args[:item_ids]
+      match_only = args[:match_only] || false
+
+      # Process items in batches
+      item_ids.each_slice(batch_size) do |batch_item_ids|
+        process_batch(batch_item_ids, args[:item_type], match_only)
+      end
+    end
+
+    private
+
+    def process_batch(item_ids, item_type, match_only)
+      klass = item_type.singularize.classify.constantize
+      items = klass.where(id: item_ids)
+
+      items.each do |item|
+        begin
+          process_item(item, item_type, match_only)
+        rescue => e
+          Rails.logger.error(
+            "Error generating concepts from #{item_type.singularize} #{item.id}: #{e.message}\n#{e.backtrace.join("\n")}",
+          )
+        end
+      end
+    end
+
+    def process_item(item, item_type, match_only)
+      # match_only applies existing concepts; otherwise the Manager identifies and creates new ones
+      if match_only
+        if item_type == "topics"
+          DiscourseAi::InferredConcepts::Manager.match_topic_to_concepts(item)
+        else # posts
+          DiscourseAi::InferredConcepts::Manager.match_post_to_concepts(item)
+        end
+      else
+        if item_type == "topics"
+          DiscourseAi::InferredConcepts::Manager.generate_concepts_from_topic(item)
+        else # posts
+          DiscourseAi::InferredConcepts::Manager.generate_concepts_from_post(item)
+        end
+      end
+    end
+  end
+end
diff --git a/app/jobs/scheduled/generate_concepts_from_popular_items.rb b/app/jobs/scheduled/generate_concepts_from_popular_items.rb
new file mode 100644
index 000000000..74f294846
--- /dev/null
+++ b/app/jobs/scheduled/generate_concepts_from_popular_items.rb
@@ -0,0 +1,85 @@
+# frozen_string_literal: true
+
+module Jobs
+  class GenerateConceptsFromPopularItems < ::Jobs::Scheduled
+    every 1.day
+
+    # This job runs daily and generates new concepts from popular topics and posts
+    # It selects items based on engagement metrics and
generates concepts from their content + def execute(_args) + return unless SiteSetting.inferred_concepts_enabled + + process_popular_topics + process_popular_posts + end + + private + + def process_popular_topics + # Find candidate topics that are popular and don't have concepts yet + candidates = + DiscourseAi::InferredConcepts::Manager.find_candidate_topics( + limit: SiteSetting.inferred_concepts_daily_topics_limit || 20, + min_posts: SiteSetting.inferred_concepts_min_posts || 5, + min_likes: SiteSetting.inferred_concepts_min_likes || 10, + min_views: SiteSetting.inferred_concepts_min_views || 100, + created_after: SiteSetting.inferred_concepts_lookback_days.days.ago, + ) + + return if candidates.blank? + + # Process candidate topics - first generate concepts, then match + Jobs.enqueue( + :generate_inferred_concepts, + item_type: "topics", + item_ids: candidates.map(&:id), + batch_size: 10, + ) + + if SiteSetting.inferred_concepts_background_match + # Schedule a follow-up job to match existing concepts + Jobs.enqueue_in( + 1.hour, + :generate_inferred_concepts, + item_type: "topics", + item_ids: candidates.map(&:id), + batch_size: 10, + match_only: true, + ) + end + end + + def process_popular_posts + # Find candidate posts that are popular and don't have concepts yet + candidates = + DiscourseAi::InferredConcepts::Manager.find_candidate_posts( + limit: SiteSetting.inferred_concepts_daily_posts_limit || 30, + min_likes: SiteSetting.inferred_concepts_post_min_likes || 5, + exclude_first_posts: true, + created_after: SiteSetting.inferred_concepts_lookback_days.days.ago, + ) + + return if candidates.blank? 
+ + # Process candidate posts - first generate concepts, then match + Jobs.enqueue( + :generate_inferred_concepts, + item_type: "posts", + item_ids: candidates.map(&:id), + batch_size: 10, + ) + + if SiteSetting.inferred_concepts_background_match + # Schedule a follow-up job to match against existing concepts + Jobs.enqueue_in( + 1.hour, + :generate_inferred_concepts, + item_type: "posts", + item_ids: candidates.map(&:id), + batch_size: 10, + match_only: true, + ) + end + end + end +end diff --git a/app/models/inferred_concept.rb b/app/models/inferred_concept.rb new file mode 100644 index 000000000..0248277fa --- /dev/null +++ b/app/models/inferred_concept.rb @@ -0,0 +1,22 @@ +# frozen_string_literal: true + +class InferredConcept < ActiveRecord::Base + has_and_belongs_to_many :topics + has_and_belongs_to_many :posts + + validates :name, presence: true, uniqueness: true +end + +# == Schema Information +# +# Table name: inferred_concepts +# +# id :bigint not null, primary key +# name :string not null +# created_at :datetime not null +# updated_at :datetime not null +# +# Indexes +# +# index_inferred_concepts_on_name (name) UNIQUE +# \ No newline at end of file diff --git a/app/serializers/ai_inferred_concept_post_serializer.rb b/app/serializers/ai_inferred_concept_post_serializer.rb new file mode 100644 index 000000000..d4bfcd628 --- /dev/null +++ b/app/serializers/ai_inferred_concept_post_serializer.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +class AiInferredConceptPostSerializer < ApplicationSerializer + attributes :id, + :post_number, + :topic_id, + :topic_title, + :username, + :avatar_template, + :created_at, + :updated_at, + :excerpt, + :truncated, + :inferred_concepts + + def avatar_template + User.avatar_template(object.username, object.uploaded_avatar_id) + end + + def excerpt + Post.excerpt(object.cooked) + end + + def truncated + object.cooked.length > SiteSetting.post_excerpt_maxlength + end + + def inferred_concepts + 
ActiveModel::ArraySerializer.new( + object.inferred_concepts, + each_serializer: InferredConceptSerializer + ) + end +end \ No newline at end of file diff --git a/app/serializers/inferred_concept_serializer.rb b/app/serializers/inferred_concept_serializer.rb new file mode 100644 index 000000000..265fe858c --- /dev/null +++ b/app/serializers/inferred_concept_serializer.rb @@ -0,0 +1,5 @@ +# frozen_string_literal: true + +class InferredConceptSerializer < ApplicationSerializer + attributes :id, :name, :created_at, :updated_at +end \ No newline at end of file diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index 3e4c10642..390da18e8 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -326,6 +326,12 @@ en: short_summarizer: name: "Summarizer (short form)" description: "Default persona used to power AI short summaries for topic lists' items" + concept_finder: + name: "Concept Finder" + description: "AI Bot specialized in identifying concepts and themes in content" + concept_matcher: + name: "Concept Matcher" + description: "AI Bot specialized in matching content against existing concepts" topic_not_found: "Summary unavailable, topic not found!" 
summarizing: "Summarizing topic" searching: "Searching for: '%{query}'" diff --git a/config/settings.yml b/config/settings.yml index c2b62d599..4c10e45d6 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -401,3 +401,43 @@ discourse_ai: allow_any: false enum: "DiscourseAi::Configuration::LlmEnumerator" validator: "DiscourseAi::Configuration::LlmValidator" + + inferred_concepts_enabled: + default: false + client: true + inferred_concepts_background_match: + default: false + client: false + inferred_concepts_daily_topics_limit: + default: 20 + client: false + inferred_concepts_min_posts: + default: 5 + client: false + inferred_concepts_min_likes: + default: 10 + client: false + inferred_concepts_min_views: + default: 100 + client: false + inferred_concepts_lookback_days: + default: 30 + client: false + inferred_concepts_daily_posts_limit: + default: 30 + client: false + inferred_concepts_post_min_likes: + default: 5 + client: false + inferred_concepts_generate_persona: + default: "-15" + type: enum + enum: "DiscourseAi::Configuration::PersonaEnumerator" + inferred_concepts_match_persona: + default: "-16" + type: enum + enum: "DiscourseAi::Configuration::PersonaEnumerator" + inferred_concepts_deduplicate_persona: + default: "-17" + type: enum + enum: "DiscourseAi::Configuration::PersonaEnumerator" diff --git a/db/migrate/20250508182047_create_inferred_concepts_table.rb b/db/migrate/20250508182047_create_inferred_concepts_table.rb new file mode 100644 index 000000000..6686c040d --- /dev/null +++ b/db/migrate/20250508182047_create_inferred_concepts_table.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true +class CreateInferredConceptsTable < ActiveRecord::Migration[7.2] + def change + create_table :inferred_concepts do |t| + t.string :name, null: false + t.timestamps + end + + add_index :inferred_concepts, :name, unique: true + end +end diff --git a/db/migrate/20250508183456_create_inferred_concepts_topics.rb 
b/db/migrate/20250508183456_create_inferred_concepts_topics.rb new file mode 100644 index 000000000..71a75570d --- /dev/null +++ b/db/migrate/20250508183456_create_inferred_concepts_topics.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +class CreateInferredConceptsTopics < ActiveRecord::Migration[7.0] + def change + create_table :inferred_concepts_topics, id: false do |t| + t.belongs_to :inferred_concept + t.belongs_to :topic + t.timestamps + end + end +end diff --git a/db/migrate/20250509000001_create_inferred_concepts_posts.rb b/db/migrate/20250509000001_create_inferred_concepts_posts.rb new file mode 100644 index 000000000..2c17c441c --- /dev/null +++ b/db/migrate/20250509000001_create_inferred_concepts_posts.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +class CreateInferredConceptsPosts < ActiveRecord::Migration[7.0] + def change + create_table :inferred_concepts_posts, id: false do |t| + t.belongs_to :inferred_concept + t.belongs_to :post + t.timestamps + end + end +end diff --git a/lib/inferred_concepts/applier.rb b/lib/inferred_concepts/applier.rb new file mode 100644 index 000000000..c73bb4c85 --- /dev/null +++ b/lib/inferred_concepts/applier.rb @@ -0,0 +1,134 @@ +# frozen_string_literal: true + +module DiscourseAi + module InferredConcepts + class Applier + # Associates the provided concepts with a topic + # topic: a Topic instance + # concepts: an array of InferredConcept instances + def self.apply_to_topic(topic, concepts) + return if topic.blank? || concepts.blank? + + topic.inferred_concepts << concepts + end + + # Associates the provided concepts with a post + # post: a Post instance + # concepts: an array of InferredConcept instances + def self.apply_to_post(post, concepts) + return if post.blank? || concepts.blank? 
+ + post.inferred_concepts << concepts + end + + # Extracts content from a topic for concept analysis + # Returns a string with the topic title and first few posts + def self.topic_content_for_analysis(topic) + return "" if topic.blank? + + # Combine title and first few posts for analysis + posts = Post.where(topic_id: topic.id).order(:post_number).limit(10) + + content = "Title: #{topic.title}\n\n" + content += posts.map { |p| "#{p.post_number}) #{p.user.username}: #{p.raw}" }.join("\n\n") + + content + end + + # Extracts content from a post for concept analysis + # Returns a string with the post content + def self.post_content_for_analysis(post) + return "" if post.blank? + + # Get the topic title for context + topic_title = post.topic&.title || "" + + content = "Topic: #{topic_title}\n\n" + content += "Post by #{post.user.username}:\n#{post.raw}" + + content + end + + # Match a topic with existing concepts + def self.match_existing_concepts(topic) + return [] if topic.blank? + + # Get content to analyze + content = topic_content_for_analysis(topic) + + # Get all existing concepts + existing_concepts = DiscourseAi::InferredConcepts::Manager.list_concepts + return [] if existing_concepts.empty? + + # Use the ConceptMatcher persona to match concepts + matched_concept_names = match_concepts_to_content(content, existing_concepts) + + # Find concepts in the database + matched_concepts = InferredConcept.where(name: matched_concept_names) + + # Apply concepts to the topic + apply_to_topic(topic, matched_concepts) + + matched_concepts + end + + # Match a post with existing concepts + def self.match_existing_concepts_for_post(post) + return [] if post.blank? + + # Get content to analyze + content = post_content_for_analysis(post) + + # Get all existing concepts + existing_concepts = DiscourseAi::InferredConcepts::Manager.list_concepts + return [] if existing_concepts.empty? 
+ + # Use the ConceptMatcher persona to match concepts + matched_concept_names = match_concepts_to_content(content, existing_concepts) + + # Find concepts in the database + matched_concepts = InferredConcept.where(name: matched_concept_names) + + # Apply concepts to the post + apply_to_post(post, matched_concepts) + + matched_concepts + end + + # Use ConceptMatcher persona to match content against provided concepts + def self.match_concepts_to_content(content, concept_list) + return [] if content.blank? || concept_list.blank? + + # Prepare user message with only the content + user_message = content + + # Use the ConceptMatcher persona to match concepts + + persona = + AiPersona + .all_personas(enabled_only: false) + .find { |persona| persona.id == SiteSetting.inferred_concepts_match_persona.to_i } + .new + + llm = LlmModel.find(persona.class.default_llm_id) + + input = { type: :user, content: content } + + context = + DiscourseAi::Personas::BotContext.new( + messages: [input], + user: Discourse.system_user, + inferred_concepts: concept_list, + ) + + bot = DiscourseAi::Personas::Bot.as(Discourse.system_user, persona: persona, model: llm) + + response = bot.reply(context) + + matching_concepts = JSON.parse(response[0][0]).dig("matching_concepts") + + matching_concepts || [] + end + end + end +end diff --git a/lib/inferred_concepts/finder.rb b/lib/inferred_concepts/finder.rb new file mode 100644 index 000000000..38d9e367d --- /dev/null +++ b/lib/inferred_concepts/finder.rb @@ -0,0 +1,171 @@ +# frozen_string_literal: true + +module DiscourseAi + module InferredConcepts + class Finder + # Identifies potential concepts from provided content + # Returns an array of concept names (strings) + def self.identify_concepts(content) + return [] if content.blank? 
+
+        # Use the ConceptFinder persona to identify concepts
+        persona =
+          AiPersona
+            .all_personas(enabled_only: false)
+            .find { |persona| persona.id == SiteSetting.inferred_concepts_generate_persona.to_i }
+            .new
+
+        llm = LlmModel.find(persona.class.default_llm_id)
+        context =
+          DiscourseAi::Personas::BotContext.new(
+            messages: [{ type: :user, content: content }],
+            user: Discourse.system_user,
+            inferred_concepts: DiscourseAi::InferredConcepts::Manager.list_concepts,
+          )
+
+        bot = DiscourseAi::Personas::Bot.as(Discourse.system_user, persona: persona, model: llm)
+
+        response = bot.reply(context)
+
+        concepts = JSON.parse(response[0][0]).dig("concepts")
+        concepts || []
+      end
+
+      # Creates or finds concepts in the database from provided names
+      # Returns an array of InferredConcept instances
+      def self.create_or_find_concepts(concept_names)
+        return [] if concept_names.blank?
+
+        concept_names.map { |name| InferredConcept.find_or_create_by(name: name) }
+      end
+
+      # Finds candidate topics to use for concept generation
+      #
+      # @param limit [Integer] Maximum number of topics to return
+      # @param min_posts [Integer] Minimum number of posts in topic
+      # @param min_likes [Integer] Minimum number of likes across all posts
+      # @param min_views [Integer] Minimum number of views
+      # @param exclude_topic_ids [Array] Topic IDs to exclude
+      # @param category_ids [Array] Only include topics from these categories (optional)
+      # @param created_after [DateTime] Only include topics created after this time (optional)
+      # @return [Array] Array of Topic objects that are good candidates
+      def self.find_candidate_topics(
+        limit: 100,
+        min_posts: 5,
+        min_likes: 10,
+        min_views: 100,
+        exclude_topic_ids: [],
+        category_ids: nil,
+        created_after: 30.days.ago
+      )
+        query =
+          Topic.where(
+            "topics.posts_count >= ? AND topics.views >= ? AND topics.like_count >= ?",
+            min_posts,
+            min_views,
+            min_likes,
+          )
+
+        # Apply additional filters
+        query = query.where("topics.id NOT IN (?)", exclude_topic_ids) if exclude_topic_ids.present?
+        query = query.where("topics.category_id IN (?)", category_ids) if category_ids.present?
+        query = query.where("topics.created_at >= ?", created_after) if created_after.present?
+
+        # Exclude PM topics (if they exist in Discourse)
+        query = query.where(archetype: Archetype.default)
+
+        # Exclude topics that already have concepts (join table created by CreateInferredConceptsTopics)
+        topics_with_concepts = <<~SQL
+          SELECT DISTINCT topic_id
+          FROM inferred_concepts_topics
+        SQL
+
+        query = query.where("topics.id NOT IN (#{topics_with_concepts})")
+
+        # Score and order topics by engagement (combination of views, likes, and posts)
+        query =
+          query.select(
+            "topics.*,
+            (topics.like_count * 2 + topics.posts_count * 3 + topics.views * 0.1) AS engagement_score",
+          ).order("engagement_score DESC")
+
+        # Return limited number of topics
+        query.limit(limit)
+      end
+
+      # Find candidate posts that are good for concept generation
+      #
+      # @param limit [Integer] Maximum number of posts to return
+      # @param min_likes [Integer] Minimum number of likes
+      # @param exclude_first_posts [Boolean] Exclude first posts in topics
+      # @param exclude_post_ids [Array] Post IDs to exclude
+      # @param category_ids [Array] Only include posts from topics in these categories
+      # @param created_after [DateTime] Only include posts created after this time
+      # @return [Array] Array of Post objects that are good candidates
+      def self.find_candidate_posts(
+        limit: 100,
+        min_likes: 5,
+        exclude_first_posts: true,
+        exclude_post_ids: [],
+        category_ids: nil,
+        created_after: 30.days.ago
+      )
+        query = Post.where("posts.like_count >= ?", min_likes)
+
+        # Exclude first posts if specified
+        query = query.where("posts.post_number > 1") if exclude_first_posts
+
+        # Apply additional filters
+        query = query.where("posts.id NOT IN (?)", exclude_post_ids) if exclude_post_ids.present?
+        query = query.where("posts.created_at >= ?", created_after) if created_after.present?
+
+        # Filter by category if specified
+        if category_ids.present?
+          query = query.joins(:topic).where("topics.category_id IN (?)", category_ids)
+        end
+
+        # Exclude posts that already have concepts (join table created by CreateInferredConceptsPosts)
+        posts_with_concepts = <<~SQL
+          SELECT DISTINCT post_id
+          FROM inferred_concepts_posts
+        SQL
+
+        query = query.where("posts.id NOT IN (#{posts_with_concepts})")
+
+        # Order by engagement (likes)
+        query = query.order(like_count: :desc)
+
+        # Return limited number of posts
+        query.limit(limit)
+      end
+
+      # Deduplicate and standardize a list of concepts
+      # @param concept_names [Array] List of concept names to deduplicate
+      # @return [Array<String>] Deduplicated concept names; empty when the input or the LLM answer is empty
+      def self.deduplicate_concepts(concept_names)
+        return [] if concept_names.blank?
+
+        # Use the ConceptDeduplicator persona to deduplicate concepts
+        persona =
+          AiPersona
+            .all_personas(enabled_only: false)
+            .find { |persona| persona.id == SiteSetting.inferred_concepts_deduplicate_persona.to_i }
+            .new
+
+        llm = LlmModel.find(persona.class.default_llm_id)
+
+        # Create the input for the deduplicator
+        input = { type: :user, content: concept_names.join(", ") }
+
+        context =
+          DiscourseAi::Personas::BotContext.new(messages: [input], user: Discourse.system_user)
+
+        bot = DiscourseAi::Personas::Bot.as(Discourse.system_user, persona: persona, model: llm)
+
+        response = bot.reply(context)
+
+        JSON.parse(response[0][0]).dig("streamlined_tags") || []
+      end
+    end
+  end
+end
diff --git a/lib/inferred_concepts/manager.rb b/lib/inferred_concepts/manager.rb
new file mode 100644
index 000000000..f1e1e061a
--- /dev/null
+++ b/lib/inferred_concepts/manager.rb
@@ -0,0 +1,194 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module InferredConcepts
+    class Manager
+      # Get a list of existing concepts
+      # @param limit [Integer, nil] Optional
maximum number of concepts to return
+      # @return [Array<String>] Concept names, ordered alphabetically
+      def self.list_concepts(limit: nil)
+        query = InferredConcept.all.order("name ASC")
+
+        # Apply limit if provided
+        query = query.limit(limit) if limit.present?
+
+        query.pluck(:name)
+      end
+
+      # Deduplicate concepts in batches by letter
+      # This method will:
+      # 1. Group concepts by first letter
+      # 2. Process each letter group separately through the deduplicator
+      # 3. Do a final pass with all deduplicated concepts
+      # @return [void] Rewrites the inferred_concepts table in place; the return value is not meaningful
+      def self.deduplicate_concepts_by_letter(per_letter_batch: 50, full_pass_batch: 150)
+        # Get all concepts
+        all_concepts = list_concepts
+        return if all_concepts.empty?
+
+        letter_groups = Hash.new { |h, k| h[k] = [] }
+
+        # Group concepts by first letter
+        all_concepts.each do |concept|
+          first_char = concept[0]&.upcase
+
+          if first_char && first_char.match?(/[A-Z]/)
+            letter_groups[first_char] << concept
+          else
+            # Non-alphabetic or empty concepts go in a special group
+            letter_groups["#"] << concept
+          end
+        end
+
+        # Process each letter group
+        letter_deduplicated_concepts = []
+
+        letter_groups.each do |_letter, concepts|
+          next if concepts.empty?
+
+          batches = concepts.each_slice(per_letter_batch).to_a
+
+          batches.each do |batch|
+            result = Finder.deduplicate_concepts(batch)
+            letter_deduplicated_concepts.concat(Array(result))
+          end
+        end
+
+        # Final pass with all deduplicated concepts
+        if letter_deduplicated_concepts.present?
+          final_result = []
+
+          batches = letter_deduplicated_concepts.each_slice(full_pass_batch).to_a
+          batches.each do |batch|
+            dedups = Finder.deduplicate_concepts(batch)
+            final_result.concat(Array(dedups))
+          end
+
+          # Remove duplicates
+          final_result.uniq!
+          return if final_result.empty? # an empty LLM answer must never wipe every stored concept
+          # Apply the deduplicated concepts
+          InferredConcept.where.not(name: final_result).destroy_all
+          InferredConcept.insert_all(final_result.map { { name: it } })
+        end
+      end
+
+      # Extract new concepts from arbitrary content
+      # @param content [String] The content to analyze
+      # @return [Array] The identified concept names
+      def self.identify_concepts(content)
+        Finder.identify_concepts(content)
+      end
+
+      # Identify and create concepts from content without applying them to any topic
+      # @param content [String] The content to analyze
+      # @return [Array] The created or found concepts
+      def self.generate_concepts_from_content(content)
+        return [] if content.blank?
+
+        # Identify concepts
+        concept_names = Finder.identify_concepts(content)
+        return [] if concept_names.blank?
+
+        # Create or find concepts in the database
+        Finder.create_or_find_concepts(concept_names)
+      end
+
+      # Generate concepts from a topic's content without applying them to the topic
+      # @param topic [Topic] A Topic instance
+      # @return [Array] The created or found concepts
+      def self.generate_concepts_from_topic(topic)
+        return [] if topic.blank?
+
+        # Get content to analyze
+        content = Applier.topic_content_for_analysis(topic)
+        return [] if content.blank?
+
+        # Generate concepts from the content
+        generate_concepts_from_content(content)
+      end
+
+      # Generate concepts from a post's content without applying them to the post
+      # @param post [Post] A Post instance
+      # @return [Array] The created or found concepts
+      def self.generate_concepts_from_post(post)
+        return [] if post.blank?
+
+        # Get content to analyze
+        content = Applier.post_content_for_analysis(post)
+        return [] if content.blank?
+
+        # Generate concepts from the content
+        generate_concepts_from_content(content)
+      end
+
+      # Match a topic against existing concepts
+      # @param topic [Topic] A Topic instance
+      # @return [Array] The concepts that were applied
+      def self.match_topic_to_concepts(topic)
+        return [] if topic.blank?
+ + Applier.match_existing_concepts(topic) + end + + # Match a post against existing concepts + # @param post [Post] A Post instance + # @return [Array] The concepts that were applied + def self.match_post_to_concepts(post) + return [] if post.blank? + + Applier.match_existing_concepts_for_post(post) + end + + # Find topics that have a specific concept + # @param concept_name [String] The name of the concept to search for + # @return [Array] Topics that have the specified concept + def self.search_topics_by_concept(concept_name) + concept = ::InferredConcept.find_by(name: concept_name) + return [] unless concept + concept.topics + end + + # Find posts that have a specific concept + # @param concept_name [String] The name of the concept to search for + # @return [Array] Posts that have the specified concept + def self.search_posts_by_concept(concept_name) + concept = ::InferredConcept.find_by(name: concept_name) + return [] unless concept + concept.posts + end + + # Match arbitrary content against existing concepts + # @param content [String] The content to analyze + # @return [Array] Names of matching concepts + def self.match_content_to_concepts(content) + existing_concepts = InferredConcept.all.pluck(:name) + return [] if existing_concepts.empty? 
+ + Applier.match_concepts_to_content(content, existing_concepts) + end + + # Find candidate topics that are good for concept generation + # + # @param opts [Hash] Options to pass to the finder + # @option opts [Integer] :limit (100) Maximum number of topics to return + # @option opts [Integer] :min_posts (5) Minimum number of posts in topic + # @option opts [Integer] :min_likes (10) Minimum number of likes across all posts + # @option opts [Integer] :min_views (100) Minimum number of views + # @option opts [Array] :exclude_topic_ids ([]) Topic IDs to exclude + # @option opts [Array] :category_ids (nil) Only include topics from these categories + # @option opts [DateTime] :created_after (30.days.ago) Only include topics created after this time + # @return [Array] Array of Topic objects that are good candidates + def self.find_candidate_topics(opts = {}) + Finder.find_candidate_topics(**opts) + end + + # Find candidate posts that are good for concept generation + # @param opts [Hash] Options to pass to the finder + # @return [Array] Array of Post objects that are good candidates + def self.find_candidate_posts(opts = {}) + Finder.find_candidate_posts(**opts) + end + end + end +end diff --git a/lib/personas/bot.rb b/lib/personas/bot.rb index b6e852c51..0dd726df6 100644 --- a/lib/personas/bot.rb +++ b/lib/personas/bot.rb @@ -152,10 +152,12 @@ def reply(context, llm_args: {}, &update_blk) raw_context << partial current_thinking << partial end - elsif partial.is_a?(DiscourseAi::Completions::StructuredOutput) - update_blk.call(partial, nil, :structured_output) - else - update_blk.call(partial) + elsif update_blk.present? 
+ if partial.is_a?(DiscourseAi::Completions::StructuredOutput) + update_blk.call(partial, nil, :structured_output) + else + update_blk.call(partial) + end end end end diff --git a/lib/personas/bot_context.rb b/lib/personas/bot_context.rb index 69d86669a..8ee814041 100644 --- a/lib/personas/bot_context.rb +++ b/lib/personas/bot_context.rb @@ -17,7 +17,8 @@ class BotContext :context_post_ids, :feature_name, :resource_url, - :cancel_manager + :cancel_manager, + :inferred_concepts def initialize( post: nil, @@ -35,7 +36,8 @@ def initialize( context_post_ids: nil, feature_name: "bot", resource_url: nil, - cancel_manager: nil + cancel_manager: nil, + inferred_concepts: [] ) @participants = participants @user = user @@ -54,7 +56,7 @@ def initialize( @resource_url = resource_url @feature_name = feature_name - @resource_url = resource_url + @inferred_concepts = inferred_concepts @cancel_manager = cancel_manager @@ -68,7 +70,15 @@ def initialize( end # these are strings that can be safely interpolated into templates - TEMPLATE_PARAMS = %w[time site_url site_title site_description participants resource_url] + TEMPLATE_PARAMS = %w[ + time + site_url + site_title + site_description + participants + resource_url + inferred_concepts + ] def lookup_template_param(key) public_send(key.to_sym) if TEMPLATE_PARAMS.include?(key) @@ -114,6 +124,7 @@ def to_json skip_tool_details: @skip_tool_details, feature_name: @feature_name, resource_url: @resource_url, + inferred_concepts: @inferred_concepts, } end end diff --git a/lib/personas/concept_deduplicator.rb b/lib/personas/concept_deduplicator.rb new file mode 100644 index 000000000..4c6902f13 --- /dev/null +++ b/lib/personas/concept_deduplicator.rb @@ -0,0 +1,53 @@ +# frozen_string_literal: true + +module DiscourseAi + module Personas + class ConceptDeduplicator < Persona + def self.default_enabled + false + end + + def system_prompt + <<~PROMPT.strip + You will be given a list of machine-generated tags. 
+ Your task is to streamline this list by merging entries who are similar or related. + + Please follow these steps to create a streamlined list of tags: + + 1. Review the entire list of tags carefully. + 2. Identify and remove any exact duplicates. + 3. Look for tags that are too specific or niche, and consider removing them or replacing them with more general terms. + 4. If there are multiple tags that convey similar concepts, choose the best one and remove the others, or add a new one that covers the missing aspect. + 5. Ensure that the remaining tags are relevant and useful for describing the content. + + When deciding which tags are "best", consider the following criteria: + - Relevance: How well does the tag describe the core content or theme? + - Generality: Is the tag specific enough to be useful, but not so specific that it's unlikely to be searched for? + - Clarity: Is the tag easy to understand and free from ambiguity? + - Popularity: Would this tag likely be used by people searching for this type of content? 
+ + Example Input: + AI Bias, AI Bots, AI Ethics, AI Helper, AI Integration, AI Moderation, AI Search, AI-Driven Moderation, AI-Generated Post Illustrations, AJAX Events, AJAX Requests, AMA Events, API, API Access, API Authentication, API Automation, API Call, API Changes, API Compliance, API Configuration, API Costs, API Documentation, API Endpoint, API Endpoints, API Functions, API Integration, API Key, API Keys, API Limitation, API Limitations, API Permissions, API Rate Limiting, API Request, API Request Optimization, API Requests, API Security, API Suspension, API Token, API Tokens, API Translation, API Versioning, API configuration, API endpoint, API key, APIs, APK, APT Package Manager, ARIA, ARIA Tags, ARM Architecture, ARM-based, AWS, AWS Lightsail, AWS RDS, AWS S3, AWS Translate, AWS costs, AWS t2.micro, Abbreviation Expansion, Abbreviations + + Example Output: + AI, AJAX, API, APK, APT Package Manager, ARIA, ARM Architecture, AWS, Abbreviations + + Please provide your streamlined list of tags within key. + + Remember, the goal is to create a more focused and effective set of tags while maintaining the essence of the original list. + + Your output should be in the following format: + + { + "streamlined_tags": ["tag1", "tag3"] + } + + PROMPT + end + + def response_format + [{ "key" => "streamlined_tags", "type" => "array" }] + end + end + end +end diff --git a/lib/personas/concept_finder.rb b/lib/personas/concept_finder.rb new file mode 100644 index 000000000..912e42fe1 --- /dev/null +++ b/lib/personas/concept_finder.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +module DiscourseAi + module Personas + class ConceptFinder < Persona + def self.default_enabled + false + end + + def system_prompt + existing_concepts = DiscourseAi::InferredConcepts::Manager.list_concepts(limit: 100) + existing_concepts_text = "" + + existing_concepts_text = <<~CONCEPTS if existing_concepts.present? 
+ The following concepts already exist in the system: + #{existing_concepts.join(", ")} + + You can reuse these existing concepts if they apply to the content, or suggest new concepts. + CONCEPTS + + <<~PROMPT.strip + You are an advanced concept tagging system that identifies key concepts, themes, and topics from provided text. + Your job is to extract meaningful labels that can be used to categorize content. + + Guidelines for generating concepts: + - Extract up to 7 concepts from the provided content + - Concepts should be single words or short phrases (1-3 words maximum) + - Focus on substantive topics, themes, technologies, methodologies, or domains + - Avoid overly general terms like "discussion" or "question" + - Ensure concepts are relevant to the core content + - Do not include proper nouns unless they represent key technologies or methodologies + - Maintain the original language of the text being analyzed + #{existing_concepts_text} + Format your response as a JSON object with a single key named "concepts", which has an array of concept strings as the value. + Your output should be in the following format: + + {"concepts": ["concept1", "concept2", "concept3"]} + + + Where the concepts are replaced by the actual concepts you've identified. + PROMPT + end + + def response_format + [{ "key" => "concepts", "type" => "array" }] + end + end + end +end diff --git a/lib/personas/concept_matcher.rb b/lib/personas/concept_matcher.rb new file mode 100644 index 000000000..8cdcdb0fa --- /dev/null +++ b/lib/personas/concept_matcher.rb @@ -0,0 +1,43 @@ +# frozen_string_literal: true + +module DiscourseAi + module Personas + class ConceptMatcher < Persona + def self.default_enabled + false + end + + def system_prompt + <<~PROMPT.strip + You are an advanced concept matching system that determines which concepts from a provided list are relevant to a piece of content. + Your job is to analyze the content and determine which concepts from the list apply to it. 
+ + Guidelines for matching concepts: + - Only select concepts that are clearly relevant to the content + - The content must substantially discuss or relate to the concept + - Superficial mentions are not enough to consider a concept relevant + - Be precise and selective - don't match concepts that are only tangentially related + - Consider both explicit mentions and implicit discussions of concepts + - Maintain the original language of the text being analyzed + - IMPORTANT: Only select from the exact concepts in the provided list - do not add new concepts + - If no concepts from the list match the content, return an empty array + + The list of available concepts is: + {inferred_concepts} + + Format your response as a JSON object with a single key named "matching_concepts", which has an array of concept strings from the provided list. + Your output should be in the following format: + + {"matching_concepts": ["concept1", "concept3", "concept5"]} + + + Only include concepts from the provided list that match the content. If no concepts match, return an empty array. 
+ PROMPT + end + + def response_format + [{ "key" => "matching_concepts", "type" => "array" }] + end + end + end +end diff --git a/lib/personas/persona.rb b/lib/personas/persona.rb index 62426f77d..002e8f4e1 100644 --- a/lib/personas/persona.rb +++ b/lib/personas/persona.rb @@ -52,6 +52,9 @@ def system_personas ShortSummarizer => -12, Designer => -13, ForumResearcher => -14, + ConceptFinder => -15, + ConceptMatcher => -16, + ConceptDeduplicator => -17, } end diff --git a/lib/post_extensions.rb b/lib/post_extensions.rb index 04a28a156..3a06495f6 100644 --- a/lib/post_extensions.rb +++ b/lib/post_extensions.rb @@ -11,6 +11,8 @@ module PostExtensions -> { where(classification_type: "sentiment") }, class_name: "ClassificationResult", as: :target + + has_and_belongs_to_many :inferred_concepts end end end diff --git a/lib/topic_extensions.rb b/lib/topic_extensions.rb index 7ab36493d..659a33923 100644 --- a/lib/topic_extensions.rb +++ b/lib/topic_extensions.rb @@ -11,6 +11,8 @@ module TopicExtensions -> { where(summary_type: AiSummary.summary_types[:gist]) }, class_name: "AiSummary", as: :target + + has_and_belongs_to_many :inferred_concepts end end end diff --git a/spec/lib/personas/persona_spec.rb b/spec/lib/personas/persona_spec.rb index d3e905680..fe310ef87 100644 --- a/spec/lib/personas/persona_spec.rb +++ b/spec/lib/personas/persona_spec.rb @@ -17,6 +17,7 @@ def system_prompt {participants} {time} {resource_url} + {inferred_concepts} PROMPT end end @@ -38,6 +39,7 @@ def system_prompt end let(:resource_url) { "https://path-to-resource" } + let(:inferred_concepts) { %w[bulbassaur charmander squirtle].join(", ") } let(:context) do DiscourseAi::Personas::BotContext.new( @@ -47,6 +49,7 @@ def system_prompt time: Time.zone.now, participants: topic_with_users.allowed_users.map(&:username).join(", "), resource_url: resource_url, + inferred_concepts: inferred_concepts, ) end @@ -66,6 +69,7 @@ def system_prompt expect(system_message).to include("joe, jane") 
expect(system_message).to include(Time.zone.now.to_s) expect(system_message).to include(resource_url) + expect(system_message).to include(inferred_concepts) tools = rendered.tools