When trying to insert data into my BigQuery tables, it takes forever. In the following code, I insert one query with 25 query documents referring to it; it takes around 70 seconds via SQLAlchemy, but less than a second via the BigQuery client:
# %%
import uuid
from sqlalchemy import create_engine
from sqlalchemy import Column, ForeignKey, String, TIMESTAMP
from sqlalchemy.ext.declarative import declarative_base
import time
from sqlalchemy.orm import Session
import datetime
# Create engine
path = "path to my big query dataset"
engine = create_engine(
    path,
    pool_size=100000,
    max_overflow=100000,
    query_cache_size=100000,
    connect_args={},
    execution_options={
        "fast_executemany": True,
        "fast_executemany_batch_size": 100000,
        "insertmanyvalues_page_size": 100000,  # Optimize batch size
        "enable_parallel_execution": True,  # Enable parallel execution where possible
    },
    echo=True,
)
# %%
#create tables
QUERY_TABLE_NAME = "query_log_entries"
QUERY_RESULT_DOCUMENT_TABLE_NAME = "query_result_documents"
Base = declarative_base()
class QueryLog(Base):
    __tablename__ = QUERY_TABLE_NAME
    query_id = Column(String, primary_key=True)
    timestamp = Column(TIMESTAMP)
    git_hash = Column(String)

class QueryResultDocument(Base):
    __tablename__ = QUERY_RESULT_DOCUMENT_TABLE_NAME
    query_id_ref = Column(String, ForeignKey(f"{QUERY_TABLE_NAME}.query_id"))
    id = Column(String, primary_key=True)
    document_id = Column(String)
# %%
id = str(uuid.uuid4())
# Create a sample query log entry
sample_query_log = QueryLog(
    query_id=id,
    timestamp=datetime.datetime.now(),
    git_hash="abc123"
)
# %%
docs = []
for i in range(25):
    sample_query_result_doc = QueryResultDocument(
        query_id_ref=id,  # Matches the query_id from sample_query_log
        id=str(uuid.uuid4()),
        document_id="policy_doc_456",
    )
    docs.append(sample_query_result_doc)
# Create all tables if they don't exist
Base.metadata.create_all(engine)
# %%
#Adding the data to the tables via sqlalchemy
start_time = time.time()
with Session(engine) as session:
    session.add(sample_query_log)
    session.bulk_save_objects(docs)
    session.commit()
end_time = time.time()
print(f"Time taken: {end_time - start_time:.2f} seconds")
# %%
from google.cloud import bigquery
# Initialize BigQuery client
client = bigquery.Client(project='project')
#Adding the data to the tables via bigquery
start_time = time.time()
client.insert_rows_json(f".{QUERY_TABLE_NAME}", [{
'query_id': sample_query_log.query_id,
'timestamp': sample_query_log.timestamp.isoformat(),
'git_hash': sample_query_log.git_hash
}])
client.insert_rows_json(f"project.{QUERY_RESULT_DOCUMENT_TABLE_NAME}", [{
'query_id_ref': doc.query_id_ref,
'id': doc.id,
'document_id': doc.document_id
} for doc in docs])
end_time = time.time()
print(f"Time taken: {end_time - start_time:.2f} seconds")
# %%
Do you know the reason for such a huge performance difference?
BigQuery is not very fast at DML transactions, and I am assuming SQLAlchemy is inserting one row at a time for you (default behavior, I believe). Check out this ticket for ideas on how to speed up inserts: #497
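For what it's worth, one workaround along those lines is to skip the ORM's per-object flush and issue a single multi-row INSERT through SQLAlchemy Core. The sketch below reuses the QueryResultDocument model and docs list from the snippet above; whether the BigQuery dialect actually folds this into a single DML statement depends on your dialect and driver versions, so treat it as an illustration rather than a guaranteed fix:

from sqlalchemy import insert

rows = [
    {"query_id_ref": doc.query_id_ref, "id": doc.id, "document_id": doc.document_id}
    for doc in docs
]

with Session(engine) as session:
    # One executemany-style INSERT for all 25 rows instead of a statement per object
    session.execute(insert(QueryResultDocument), rows)
    session.commit()

If that is still slow, the streaming path you already use (insert_rows_json) or a load job via client.load_table_from_json avoids DML entirely, which is why the raw client call finishes in well under a second.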