Added summarizing examples (#955)

SilasMarvin · web-flow · commit 8dce07d6b9b4 · 2023-08-25T16:13:44.000-07:00
diff --git a/pgml-sdks/rust/pgml/javascript/examples/README.md b/pgml-sdks/rust/pgml/javascript/examples/README.md
@@ -11,3 +11,6 @@ In this example, we will use `hknlp/instructor-base` model to build text embeddi
 
 ### [Extractive Question Answering](./extractive_question_answering.js)
 In this example, we will show how to use `vector_recall` result as a `context` to a HuggingFace question answering model. We will use `Builtins.transform()` to run the model on the database.
+
+### [Summarizing Question Answering](./summarizing_question_answering.js)
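+In this example, we will use `vector_recall` to find the documents most relevant to a question, and then summarize them with a HuggingFace summarization model. We will use `Builtins.transform()` to run the model on the database.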
diff --git a/pgml-sdks/rust/pgml/javascript/examples/summarizing_question_answering.js b/pgml-sdks/rust/pgml/javascript/examples/summarizing_question_answering.js
@@ -0,0 +1,76 @@
+const pgml = require("pgml");
+require("dotenv").config();
+
+pgml.js_init_logger();
+
+/**
+ * Summarizing question answering example.
+ *
+ * Creates a collection with a pipeline built from an argument-less model and
+ * splitter, upserts two documents, fetches the top `vector_recall` result for
+ * a question, summarizes that text with `builtins.transform()`, and then
+ * archives the collection.
+ *
+ * @returns {Promise<*>} The value resolved by `builtins.transform()`.
+ */
+const main = async () => {
+  // Initialize the collection
+  const collection = pgml.newCollection("my_javascript_sqa_collection");
+
+  // Add a pipeline
+  const model = pgml.newModel();
+  const splitter = pgml.newSplitter();
+  const pipeline = pgml.newPipeline(
+    "my_javascript_sqa_pipeline",
+    model,
+    splitter,
+  );
+  await collection.add_pipeline(pipeline);
+
+  // Upsert documents, these documents are automatically split into chunks and embedded by our pipeline
+  const documents = [
+    {
+      id: "Document One",
+      text: "PostgresML is the best tool for machine learning applications!",
+    },
+    {
+      id: "Document Two",
+      text: "PostgresML is open source and available to everyone!",
+    },
+  ];
+  await collection.upsert_documents(documents);
+
+  const query = "What is the best tool for machine learning?";
+
+  // Perform vector search
+  const queryResults = await collection
+    .query()
+    .vector_recall(query, pipeline)
+    .limit(1)
+    .fetch_all();
+
+  // Construct context from results; element 1 of each result is used as its text
+  const context = queryResults.map((result) => result[1]).join("\n");
+
+  // Query for summarization
+  const builtins = pgml.newBuiltins();
+  const answer = await builtins.transform(
+    { task: "summarization", model: "sshleifer/distilbart-cnn-12-6" },
+    [context],
+  );
+
+  // Archive the collection
+  await collection.archive();
+  return answer;
+};
+
+// Report failures explicitly and exit non-zero instead of leaving the
+// promise chain without a rejection handler.
+main()
+  .then((results) => {
+    console.log("Question summary: \n", results);
+  })
+  .catch((error) => {
+    console.error("Summarizing question answering example failed:", error);
+    process.exitCode = 1;
+  });
diff --git a/pgml-sdks/rust/pgml/python/examples/README.md b/pgml-sdks/rust/pgml/python/examples/README.md
@@ -15,3 +15,6 @@ In this example, we will show how to use `vector_recall` result as a `context` t
 ### [Table Question Answering](./table_question_answering.py)
 In this example, we will use [Open Table-and-Text Question Answering (OTT-QA)
 ](https://github.com/wenhuchen/OTT-QA) dataset to run queries on tables. We will use `deepset/all-mpnet-base-v2-table` model that is trained for embedding tabular data for retrieval tasks. 
+
+### [Summarizing Question Answering](./summarizing_question_answering.py)
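+In this example, we will use `vector_recall` to find the documents most relevant to a question, and then summarize the top result with a HuggingFace summarization model. We will use `Builtins.transform()` to run the model on the database.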
diff --git a/pgml-sdks/rust/pgml/python/examples/extractive_question_answering.py b/pgml-sdks/rust/pgml/python/examples/extractive_question_answering.py
@@ -56,8 +56,7 @@ async def main():
         "question-answering", [json.dumps({"question": query, "context": context})]
     )
     end = time()
-    console.print("Results for query '%s'" % query, style="bold")
-    console.print(answer)
+    console.print("Answer '%s'" % answer, style="bold")
     console.print("Query time = %0.3f" % (end - start))
 
     # Archive collection
diff --git a/pgml-sdks/rust/pgml/python/examples/summarizing_question_answering.py b/pgml-sdks/rust/pgml/python/examples/summarizing_question_answering.py
@@ -0,0 +1,82 @@
+from pgml import Collection, Model, Splitter, Pipeline, Builtins, py_init_logger
+import json
+from datasets import load_dataset
+from time import time
+from dotenv import load_dotenv
+from rich.console import Console
+import asyncio
+
+
+py_init_logger()
+
+
+async def main():
+    """Summarize the SQuAD passage most relevant to a question.
+
+    Upserts the first 200 de-duplicated SQuAD training contexts into a
+    collection, recalls up to five results for the query, summarizes the
+    top result with a summarization model, and archives the collection.
+    """
+    load_dotenv()
+    console = Console()
+
+    # Initialize collection
+    collection = Collection("squad_collection")
+
+    # Create a pipeline using the default model and splitter
+    model = Model()
+    splitter = Splitter()
+    pipeline = Pipeline("squadv1", model, splitter)
+    await collection.add_pipeline(pipeline)
+
+    # Prep documents for upserting
+    data = load_dataset("squad", split="train")
+    data = data.to_pandas()
+    data = data.drop_duplicates(subset=["context"])
+    documents = [
+        {"id": r["id"], "text": r["context"], "title": r["title"]}
+        for r in data.to_dict(orient="records")
+    ]
+
+    # Upsert documents
+    await collection.upsert_documents(documents[:200])
+
+    # Query for context
+    query = "Who won more than 20 grammy awards?"
+    console.print("Querying for context ...")
+    start = time()
+    results = (
+        await collection.query().vector_recall(query, pipeline).limit(5).fetch_all()
+    )
+    end = time()
+    console.print("\n Results for '%s' " % (query), style="bold")
+    console.print(results)
+    console.print("Query time = %0.3f" % (end - start))
+
+    if results:
+        # Construct context from the top result only, collapsing runs of whitespace
+        context = " ".join(results[0][1].strip().split())
+        # NOTE(review): quotes are escaped by hand here; confirm transform() still needs this
+        context = context.replace('"', '\\"').replace("'", "''")
+
+        # Query for summary
+        builtins = Builtins()
+        console.print("Querying for summary ...")
+        start = time()
+        summary = await builtins.transform(
+            {"task": "summarization", "model": "sshleifer/distilbart-cnn-12-6"},
+            [context],
+        )
+        end = time()
+        console.print("Summary '%s'" % summary, style="bold")
+        console.print("Query time = %0.3f" % (end - start))
+    else:
+        # results[0] would raise IndexError; skip summarization but still archive
+        console.print("No results found, skipping summary", style="bold")
+
+    # Archive collection
+    await collection.archive()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

Original file line number	Diff line number	Diff line change
`@@ -56,8 +56,7 @@ async def main():`
`56`	`56`	`"question-answering", [json.dumps({"question": query, "context": context})]`
`57`	`57`	`)`
`58`	`58`	`end = time()`
`59`		`- console.print("Results for query '%s'" % query, style="bold")`
`60`		`- console.print(answer)`
	`59`	`+ console.print("Answer '%s'" % answer, style="bold")`
`61`	`60`	`console.print("Query time = %0.3f" % (end - start))`
`62`	`61`
`63`	`62`	`# Archive collection`