From a678f62a5ac59d907e7da9dfc704801f42061c86 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 25 Aug 2023 15:46:50 -0700 Subject: [PATCH 1/3] Added summarizing example --- pgml-sdks/rust/pgml/python/examples/README.md | 3 + .../examples/extractive_question_answering.py | 3 +- .../summarizing_question_answering.py | 70 +++++++++++++++++++ 3 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 pgml-sdks/rust/pgml/python/examples/summarizing_question_answering.py diff --git a/pgml-sdks/rust/pgml/python/examples/README.md b/pgml-sdks/rust/pgml/python/examples/README.md index e2e22eb6e..81416c038 100644 --- a/pgml-sdks/rust/pgml/python/examples/README.md +++ b/pgml-sdks/rust/pgml/python/examples/README.md @@ -15,3 +15,6 @@ In this example, we will show how to use `vector_recall` result as a `context` t ### [Table Question Answering](./table_question_answering.py) In this example, we will use [Open Table-and-Text Question Answering (OTT-QA) ](https://github.com/wenhuchen/OTT-QA) dataset to run queries on tables. We will use `deepset/all-mpnet-base-v2-table` model that is trained for embedding tabular data for retrieval tasks. + +### [Summarizing Question Answering](./summarizing_question_answering.py) +In this example, we will use `vector_recall` to find the documents in a collection that are relevant to a question, and then summarize the retrieved text with a HuggingFace summarization model using `Builtins.transform()`. 
diff --git a/pgml-sdks/rust/pgml/python/examples/extractive_question_answering.py b/pgml-sdks/rust/pgml/python/examples/extractive_question_answering.py index 21cfc90b5..21b5f2e67 100644 --- a/pgml-sdks/rust/pgml/python/examples/extractive_question_answering.py +++ b/pgml-sdks/rust/pgml/python/examples/extractive_question_answering.py @@ -56,8 +56,7 @@ async def main(): "question-answering", [json.dumps({"question": query, "context": context})] ) end = time() - console.print("Results for query '%s'" % query, style="bold") - console.print(answer) + console.print("Answer '%s'" % answer, style="bold") console.print("Query time = %0.3f" % (end - start)) # Archive collection diff --git a/pgml-sdks/rust/pgml/python/examples/summarizing_question_answering.py b/pgml-sdks/rust/pgml/python/examples/summarizing_question_answering.py new file mode 100644 index 000000000..ab0f51f49 --- /dev/null +++ b/pgml-sdks/rust/pgml/python/examples/summarizing_question_answering.py @@ -0,0 +1,70 @@ +from pgml import Collection, Model, Splitter, Pipeline, Builtins, py_init_logger +import json +from datasets import load_dataset +from time import time +from dotenv import load_dotenv +from rich.console import Console +import asyncio + + +py_init_logger() + +async def main(): + load_dotenv() + console = Console() + + # Initialize collection + collection = Collection("squad_collection") + + # Create a pipeline using the default model and splitter + model = Model() + splitter = Splitter() + pipeline = Pipeline("squadv1", model, splitter) + await collection.add_pipeline(pipeline) + + # Prep documents for upserting + data = load_dataset("squad", split="train") + data = data.to_pandas() + data = data.drop_duplicates(subset=["context"]) + documents = [ + {"id": r["id"], "text": r["context"], "title": r["title"]} + for r in data.to_dict(orient="records") + ] + + # Upsert documents + await collection.upsert_documents(documents[:200]) + + # Query for context + query = "Who won more than 20 grammy 
awards?" + console.print("Querying for context ...") + start = time() + results = ( + await collection.query().vector_recall(query, pipeline).limit(5).fetch_all() + ) + end = time() + console.print("\n Results for '%s' " % (query), style="bold") + console.print(results) + console.print("Query time = %0.3f" % (end - start)) + + # Construct context from results + context = " ".join(results[0][1].strip().split()) + context = context.replace('"', '\\"').replace("'", "''") + + # Query for summary + builtins = Builtins() + console.print("Querying for summary ...") + start = time() + summary = await builtins.transform( + {"task": "summarization", "model": "sshleifer/distilbart-cnn-12-6"}, + [json.dumps({"question": query, "context": context})], + ) + end = time() + console.print("Summary '%s'" % summary, style="bold") + console.print("Query time = %0.3f" % (end - start)) + + # Archive collection + await collection.archive() + + +if __name__ == "__main__": + asyncio.run(main()) From f3bbab22047bdc033f3983606437309b04a8c70b Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 25 Aug 2023 15:53:15 -0700 Subject: [PATCH 2/3] Cleaned up --- .../pgml/python/examples/summarizing_question_answering.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pgml-sdks/rust/pgml/python/examples/summarizing_question_answering.py b/pgml-sdks/rust/pgml/python/examples/summarizing_question_answering.py index ab0f51f49..4c291aac0 100644 --- a/pgml-sdks/rust/pgml/python/examples/summarizing_question_answering.py +++ b/pgml-sdks/rust/pgml/python/examples/summarizing_question_answering.py @@ -9,6 +9,7 @@ py_init_logger() + async def main(): load_dotenv() console = Console() @@ -50,13 +51,13 @@ async def main(): context = " ".join(results[0][1].strip().split()) context = context.replace('"', '\\"').replace("'", "''") - # Query for summary + # Query for summary builtins = Builtins() console.print("Querying for summary ...") start 
= time() summary = await builtins.transform( {"task": "summarization", "model": "sshleifer/distilbart-cnn-12-6"}, - [json.dumps({"question": query, "context": context})], + [context], ) end = time() console.print("Summary '%s'" % summary, style="bold") From bd2ebc5aa341615a86d1fda0534ff128eea84a15 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 25 Aug 2023 16:11:41 -0700 Subject: [PATCH 3/3] Added summarizing example for javascript --- .../rust/pgml/javascript/examples/README.md | 3 + .../summarizing_question_answering.js | 63 +++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 pgml-sdks/rust/pgml/javascript/examples/summarizing_question_answering.js diff --git a/pgml-sdks/rust/pgml/javascript/examples/README.md b/pgml-sdks/rust/pgml/javascript/examples/README.md index 440058e4f..77e13b638 100644 --- a/pgml-sdks/rust/pgml/javascript/examples/README.md +++ b/pgml-sdks/rust/pgml/javascript/examples/README.md @@ -11,3 +11,6 @@ In this example, we will use `hknlp/instructor-base` model to build text embeddi ### [Extractive Question Answering](./extractive_question_answering.js) In this example, we will show how to use `vector_recall` result as a `context` to a HuggingFace question answering model. We will use `Builtins.transform()` to run the model on the database. + +### [Summarizing Question Answering](./summarizing_question_answering.js) +In this example, we will use `vector_recall` to find the documents in a collection that are relevant to a question, and then summarize the retrieved text with a HuggingFace summarization model using `Builtins.transform()`. 
diff --git a/pgml-sdks/rust/pgml/javascript/examples/summarizing_question_answering.js b/pgml-sdks/rust/pgml/javascript/examples/summarizing_question_answering.js new file mode 100644 index 000000000..a5e5fe19b --- /dev/null +++ b/pgml-sdks/rust/pgml/javascript/examples/summarizing_question_answering.js @@ -0,0 +1,63 @@ +const pgml = require("pgml"); +require("dotenv").config(); + +pgml.js_init_logger(); + +const main = async () => { + // Initialize the collection + const collection = pgml.newCollection("my_javascript_sqa_collection"); + + // Add a pipeline + const model = pgml.newModel(); + const splitter = pgml.newSplitter(); + const pipeline = pgml.newPipeline( + "my_javascript_sqa_pipeline", + model, + splitter, + ); + await collection.add_pipeline(pipeline); + + // Upsert documents, these documents are automatically split into chunks and embedded by our pipeline + const documents = [ + { + id: "Document One", + text: "PostgresML is the best tool for machine learning applications!", + }, + { + id: "Document Two", + text: "PostgresML is open source and available to everyone!", + }, + ]; + await collection.upsert_documents(documents); + + const query = "What is the best tool for machine learning?"; + + // Perform vector search + const queryResults = await collection + .query() + .vector_recall(query, pipeline) + .limit(1) + .fetch_all(); + + // Construct context from results + const context = queryResults + .map((result) => { + return result[1]; + }) + .join("\n"); + + // Query for summarization + const builtins = pgml.newBuiltins(); + const answer = await builtins.transform( + { task: "summarization", model: "sshleifer/distilbart-cnn-12-6" }, + [context], + ); + + // Archive the collection + await collection.archive(); + return answer; +}; + +main().then((results) => { + console.log("Question summary: \n", results); +});