Skip to content

feat: Update llm.TextEmbeddingGenerator to 005 #1186

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions bigframes/ml/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,11 @@
_EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT,
)

_TEXT_EMBEDDING_005_ENDPOINT = "text-embedding-005"
_TEXT_EMBEDDING_004_ENDPOINT = "text-embedding-004"
_TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT = "text-multilingual-embedding-002"
_TEXT_EMBEDDING_ENDPOINTS = (
_TEXT_EMBEDDING_005_ENDPOINT,
_TEXT_EMBEDDING_004_ENDPOINT,
_TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT,
)
Expand Down Expand Up @@ -606,8 +608,8 @@ class TextEmbeddingGenerator(base.BaseEstimator):

Args:
model_name (str, Default to "text-embedding-004"):
The model for text embedding. Possible values are "text-embedding-004" or "text-multilingual-embedding-002".
text-embedding models returns model embeddings for text inputs.
The model for text embedding. Possible values are "text-embedding-005", "text-embedding-004"
or "text-multilingual-embedding-002". text-embedding models return model embeddings for text inputs.
text-multilingual-embedding models, which support over 100 languages, return model embeddings for text inputs.
Default to "text-embedding-004".
session (bigframes.Session or None):
Expand All @@ -621,7 +623,9 @@ def __init__(
self,
*,
model_name: Literal[
"text-embedding-004", "text-multilingual-embedding-002"
"text-embedding-005",
"text-embedding-004",
"text-multilingual-embedding-002",
] = "text-embedding-004",
session: Optional[bigframes.Session] = None,
connection_name: Optional[str] = None,
Expand Down
1 change: 1 addition & 0 deletions bigframes/ml/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
llm._CLAUDE_3_SONNET_ENDPOINT: llm.Claude3TextGenerator,
llm._CLAUDE_3_5_SONNET_ENDPOINT: llm.Claude3TextGenerator,
llm._CLAUDE_3_OPUS_ENDPOINT: llm.Claude3TextGenerator,
llm._TEXT_EMBEDDING_005_ENDPOINT: llm.TextEmbeddingGenerator,
llm._TEXT_EMBEDDING_004_ENDPOINT: llm.TextEmbeddingGenerator,
llm._TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT: llm.TextEmbeddingGenerator,
}
Expand Down
6 changes: 3 additions & 3 deletions bigframes/operations/semantics.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,12 +647,12 @@ def search(
>>> bigframes.options.experiments.semantic_operators = True

>>> import bigframes.ml.llm as llm
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-004")
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The doc tests fail on the semantics examples. These changes are unnecessary; maybe just revert them.


>>> df = bpd.DataFrame({"creatures": ["salmon", "sea urchin", "frog", "chimpanzee"]})
>>> df.semantics.search("creatures", "monkey", top_k=1, model=model, score_column='distance')
creatures distance
3 chimpanzee 0.781101
3 chimpanzee 0.635844
<BLANKLINE>
[1 rows x 2 columns]

Expand Down Expand Up @@ -945,7 +945,7 @@ def sim_join(
>>> bigframes.options.experiments.semantic_operators = True

>>> import bigframes.ml.llm as llm
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-004")
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")

>>> df1 = bpd.DataFrame({'animal': ['monkey', 'spider']})
>>> df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon']})
Expand Down
2 changes: 1 addition & 1 deletion notebooks/experimental/semantic_operators.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@
"source": [
"import bigframes.ml.llm as llm\n",
"gemini_model = llm.GeminiTextGenerator(model_name=llm._GEMINI_1P5_FLASH_001_ENDPOINT)\n",
"text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-004\")"
"text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")"
]
},
{
Expand Down
6 changes: 3 additions & 3 deletions owlbot.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@

# Use a custom table of contents since the default one isn't organized well
# enough for the number of classes we have.
assert 1 == s.replace( # publish-docs.sh
assert 1 == s.replace( # publish-docs.sh
[".kokoro/publish-docs.sh"],
(
re.escape("# upload docs")
Expand All @@ -122,14 +122,14 @@
)

# Fixup the documentation.
assert 1 == s.replace( # docs/conf.py
assert 1 == s.replace( # docs/conf.py
["docs/conf.py"],
re.escape("Google Cloud Client Libraries for bigframes"),
"BigQuery DataFrames provides DataFrame APIs on the BigQuery engine",
)

# Don't omit `*/core/*.py` when counting test coverages
assert 1 == s.replace( # .coveragerc
assert 1 == s.replace( # .coveragerc
[".coveragerc"],
re.escape(" */core/*.py\n"),
"",
Expand Down
2 changes: 1 addition & 1 deletion tests/system/large/operations/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@ def gemini_flash_model(session, bq_connection) -> llm.GeminiTextGenerator:
@pytest.fixture(scope="session")
def text_embedding_generator(session, bq_connection) -> llm.TextEmbeddingGenerator:
return llm.TextEmbeddingGenerator(
session=session, connection_name=bq_connection, model_name="text-embedding-004"
session=session, connection_name=bq_connection, model_name="text-embedding-005"
)
6 changes: 3 additions & 3 deletions tests/system/small/ml/test_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def test_text_generator_predict_with_params_success(

@pytest.mark.parametrize(
"model_name",
("text-embedding-004", "text-multilingual-embedding-002"),
("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"),
)
def test_create_load_text_embedding_generator_model(
dataset_id, model_name, session, bq_connection
Expand All @@ -218,7 +218,7 @@ def test_create_load_text_embedding_generator_model(

@pytest.mark.parametrize(
"model_name",
("text-embedding-004", "text-multilingual-embedding-002"),
("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"),
)
@pytest.mark.flaky(retries=2)
def test_text_embedding_generator_predict_default_params_success(
Expand All @@ -236,7 +236,7 @@ def test_text_embedding_generator_predict_default_params_success(

@pytest.mark.parametrize(
"model_name",
("text-embedding-004", "text-multilingual-embedding-002"),
("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"),
)
@pytest.mark.flaky(retries=2)
def test_text_embedding_generator_multi_cols_predict_success(
Expand Down
Loading