Added rag-retrieval-timing-tests #1361

Open · wants to merge 1 commit into master
6 changes: 6 additions & 0 deletions pgml-apps/rag-retrieval-timing-tests/.env.development
@@ -0,0 +1,6 @@
PINECONE_API_KEY=
QDRANT_API_KEY=
ZILLIZ_API_KEY=
WCS_API_KEY=
OPENAI_API_KEY=
HF_TOKEN=
7 changes: 7 additions & 0 deletions pgml-apps/rag-retrieval-timing-tests/README.md
@@ -0,0 +1,7 @@
# RAG Timing Tests

This script runs timing tests for common RAG (retrieval-augmented generation) systems.

To run it, copy `.env.development` to `.env` and set the appropriate variables in the `.env` file, install the dependencies in `requirements.txt`, and run `python3 __main__.py`.
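
For example, assuming a POSIX shell (exact commands may vary by environment):

```bash
cp .env.development .env          # then fill in your API keys
pip install -r requirements.txt
python3 __main__.py
```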

Note that this script assumes certain setup actions, such as creating databases or "collections", have already been performed for each cloud provider. See the script for more details.
161 changes: 161 additions & 0 deletions pgml-apps/rag-retrieval-timing-tests/__main__.py
@@ -0,0 +1,161 @@
import time
import asyncio

import postgresml as pgl
import zilliz_local as zl
import pinecone_local as pl
import qdrant_local as ql
import openai_local as al
import huggingface as hf
import weaviate_local as wl

TRIAL_COUNT = 2

# The vector store / service combinations we are testing
tests = [
{
"name": "PostgresML",
"vector_store": pgl,
"rag+": True,
"chatbot_service": al,
"async": True,
},
{"name": "Weaviate", "vector_store": wl, "chatbot_service": al, "rag++": True},
{
"name": "Zilliz",
"vector_store": zl,
"embedding_service": hf,
"chatbot_service": al,
},
{
"name": "Pinecone",
"vector_store": pl,
"embedding_service": hf,
"chatbot_service": al,
},
{
"name": "Qdrant",
"vector_store": ql,
"embedding_service": hf,
"chatbot_service": al,
},
]


# Our test documents
# Two are enough: every search query is meant to retrieve the first one
documents = [
{"id": "0", "metadata": {"text": "The hidden value is 1000"}},
{
"id": "1",
"metadata": {"text": "This is just some random text"},
},
]


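# Run func as a coroutine when the test dict marks it async (e.g. PostgresML);
# otherwise call it synchronously.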
def maybe_do_async(func, check_dict, *args):
if "async" in check_dict and check_dict["async"]:
return asyncio.run(func(*args))
else:
return func(*args)


def do_data_upsert(name, vector_store, **kwargs):
print(f"Doing Data Upsert For: {name}")
if "rag++" in kwargs or "rag+" in kwargs:
maybe_do_async(vector_store.upsert_data, kwargs, documents)
else:
texts = [d["metadata"]["text"] for d in documents]
(embeddings, time_to_embed) = kwargs["embedding_service"].get_embeddings(texts)
maybe_do_async(vector_store.upsert_data, kwargs, documents, embeddings)
print(f"Done Doing Data Upsert For: {name}\n")


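# Three tiers of integration are timed below:
# - "rag++": the vector store client embeds, searches, and generates the LLM
#   response in a single call (embed/search times are folded into completion).
# - "rag+": the vector store embeds and searches in one call; a separate
#   chatbot service generates the response.
# - default: a separate embedding service embeds the query before the vector
#   store search and the chatbot completion.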
def do_normal_rag_test(name, vector_store, **kwargs):
print(f"Doing RAG Test For: {name}")
query = "What is the hidden value?"
if "rag++" in kwargs:
(result, time_to_complete) = maybe_do_async(
vector_store.get_llm_response, kwargs, query
)
time_to_embed = 0
time_to_search = 0
elif "rag+" in kwargs:
time_to_embed = 0
(context, time_to_search) = maybe_do_async(
vector_store.do_search, kwargs, query
)
(result, time_to_complete) = kwargs["chatbot_service"].get_llm_response(
query, context
)
else:
(embeddings, time_to_embed) = kwargs["embedding_service"].get_embeddings(
[query]
)
(context, time_to_search) = vector_store.do_search(embeddings[0])
(result, time_to_complete) = kwargs["chatbot_service"].get_llm_response(
query, context
)
print(f"\tThe LLM Said: {result}")
time_for_retrieval = time_to_embed + time_to_search
total_time = time_to_embed + time_to_search + time_to_complete
print(f"Done Doing RAG Test For: {name}")
print(f"- Time to Embed: {time_to_embed}")
print(f"- Time to Search: {time_to_search}")
print(f"- Total Time for Retrieval: {time_for_retrieval}")
print(f"- Time for Chatbot Completion: {time_to_complete}")
print(f"- Total Time Taken: {total_time}\n")
return {
"time_to_embed": time_to_embed,
"time_to_search": time_to_search,
"time_for_retrieval": time_for_retrieval,
"time_to_complete": time_to_complete,
"total_time": total_time,
}


if __name__ == "__main__":
print("----------Doing Data Setup-------------------------\n")
for test in tests:
do_data_upsert(**test)
print("\n----------Done Doing Data Setup------------------\n\n")

print("----------Doing RAG Tests-------------------------\n")
stats = {}
for i in range(TRIAL_COUNT):
for test in tests:
times = do_normal_rag_test(**test)
if test["name"] not in stats:
stats[test["name"]] = []
stats[test["name"]].append(times)
print("\n----------Done Doing RAG Tests---------------------\n")

print("------------Final Results---------------------------\n")
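# Average each metric across trials: sum over trials here, then divide by TRIAL_COUNT in the prints below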
for test in tests:
trials = stats[test["name"]]
(
time_to_embed,
time_to_search,
time_for_retrieval,
time_to_complete,
total_time,
) = [
sum(trial[key] for trial in trials)
for key in [
"time_to_embed",
"time_to_search",
"time_for_retrieval",
"time_to_complete",
"total_time",
]
]
print(f'Done Doing RAG Test For: {test["name"]}')
print(f"- Average Time to Embed: {(time_to_embed / TRIAL_COUNT):0.4f}")
print(f"- Average Time to Search: {(time_to_search / TRIAL_COUNT):0.4f}")
print(
f"- Average Total Time for Retrieval: {(time_for_retrieval / TRIAL_COUNT):0.4f}"
)
print(
f"- Average Time for Chatbot Completion: {(time_to_complete / TRIAL_COUNT):0.4f}"
)
print(f"- Average Total Time Taken: {(total_time / TRIAL_COUNT):0.4f}\n")
29 changes: 29 additions & 0 deletions pgml-apps/rag-retrieval-timing-tests/huggingface.py
@@ -0,0 +1,29 @@
import requests
import time
import os
import sys
from dotenv import load_dotenv

# Load our environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")


# Get the embedding from HuggingFace
def get_embeddings(inputs):
print("\tGetting embeddings from HuggingFace")
tic = time.perf_counter()
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
payload = {"inputs": inputs}
response = requests.post(
"https://api-inference.huggingface.co/pipeline/feature-extraction/intfloat/e5-small",
headers=headers,
json=payload,
)
toc = time.perf_counter()
time_taken = toc - tic
print(f"\tDone getting embeddings: {toc - tic:0.4f}\n")
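# On success the feature-extraction endpoint returns one embedding (a list of
# floats) per input string; on failure it returns a dict with an "error" key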
response = response.json()
if "error" in response:
sys.exit(response)
return (response, time_taken)
26 changes: 26 additions & 0 deletions pgml-apps/rag-retrieval-timing-tests/openai_local.py
@@ -0,0 +1,26 @@
from openai import OpenAI
from dotenv import load_dotenv
import time

# Load our environment variables so OPENAI_API_KEY is available
load_dotenv()

# Create our OpenAI client (it reads OPENAI_API_KEY from the environment)
client = OpenAI()


# Get LLM response from OpenAI
def get_llm_response(query, context):
print("\tGetting LLM response from OpenAI")
tic = time.perf_counter()
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "system",
"content": f"You are a helpful assistant. Given the context, provide an answer to the user: \n{context}",
},
{"role": "user", "content": query},
],
)
toc = time.perf_counter()
time_taken = toc - tic
print(f"\tDone getting the LLM response: {time_taken:0.4f}")
response = completion.choices[0].message.content
return (response, time_taken)
43 changes: 43 additions & 0 deletions pgml-apps/rag-retrieval-timing-tests/pinecone_local.py
@@ -0,0 +1,43 @@
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import time
import os

# Load our environment variables
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Create our Pinecone client
# Note: we created their default index ahead of time using the gcp-starter
# environment (us-central1 region)
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("test")
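
# A rough sketch of the one-time index creation this script assumes has already
# been done (the spec values below are illustrative assumptions, not from this repo):
#
#     pc.create_index(
#         name="test",
#         dimension=384,  # intfloat/e5-small produces 384-dimensional embeddings
#         metric="cosine",
#         spec=ServerlessSpec(cloud="gcp", region="us-central1"),  # assumed
#     )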


# Store some initial documents to retrieve
def upsert_data(documents, embeddings):
for document, embedding in zip(documents, embeddings):
document["values"] = embedding
print("\tStarting Pinecone upsert")
tic = time.perf_counter()
index.upsert(documents, namespace="ns1")
toc = time.perf_counter()
time_taken_to_upsert = toc - tic
print(f"\tDone Pinecone upsert: {time_taken_to_upsert:0.4f}")
return time_taken_to_upsert


# Do cosine similarity search over our Pinecone index
def do_search(vector):
print("\tDoing cosine similarity search with Pinecone")
tic = time.perf_counter()
results = index.query(
namespace="ns1",
vector=vector,
top_k=1,
include_metadata=True,
)
toc = time.perf_counter()
time_done = toc - tic
print(f"\tDone doing cosine similarity search: {time_done:0.4f}\n")
result = results["matches"][0]["metadata"]["text"]
return (result, time_done)
62 changes: 62 additions & 0 deletions pgml-apps/rag-retrieval-timing-tests/postgresml.py
@@ -0,0 +1,62 @@
from pgml import Collection, Pipeline
from dotenv import load_dotenv
import time

# Load our environment variables
load_dotenv()

# Initialize our Collection and Pipeline
collection = Collection("test_collection")
pipeline = Pipeline(
"test_pipeline",
{
"text": {
"semantic_search": {
"model": "intfloat/e5-small",
},
}
},
)


# Add the Pipeline to our collection
# We only need to do this once
async def setup_pipeline():
await collection.add_pipeline(pipeline)


async def upsert_data(documents):
documents = [
{"id": document["id"], "text": document["metadata"]["text"]}
for document in documents
]
print("Starting PostgresML upsert")
tic = time.perf_counter()
await collection.upsert_documents(documents)
toc = time.perf_counter()
time_taken = toc - tic
print(f"Done PostgresML upsert: {time_taken:0.4f}\n")


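# The Pipeline embeds the query inside the database, so there is no separate
# embedding call to time here.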
async def do_search(query):
print(
"\tDoing embedding and cosine similarity search over our PostgresML Collection"
)
tic = time.perf_counter()
results = await collection.vector_search(
{
"query": {
"fields": {
"text": {
"query": query,
},
}
},
"limit": 1,
},
pipeline,
)
toc = time.perf_counter()
time_taken = toc - tic
print(f"\tDone doing embedding and cosine similarity search: {time_taken:0.4f}\n")
return (results[0]["chunk"], time_taken)
49 changes: 49 additions & 0 deletions pgml-apps/rag-retrieval-timing-tests/qdrant_local.py
@@ -0,0 +1,49 @@
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from dotenv import load_dotenv
import time
import os

# Load our environment variables
load_dotenv()
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# Create our Qdrant client
qdrant = QdrantClient(
url="https://059364f6-62c5-4f80-9f19-cf6d6394caae.us-east4-0.gcp.cloud.qdrant.io:6333",
api_key=QDRANT_API_KEY,
)

# (Re)create our Qdrant collection; recreate_collection drops any existing one with this name
# size=384 matches the output dimension of intfloat/e5-small embeddings
qdrant.recreate_collection(
collection_name="test",
vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)


# Store some initial documents to retrieve
def upsert_data(documents, embeddings):
points = [
PointStruct(
id=int(document["id"]), vector=embedding, payload=document["metadata"]
)
for document, embedding in zip(documents, embeddings)
]
print("\tStarting Qdrant upsert")
tic = time.perf_counter()
qdrant.upsert(collection_name="test", points=points)
toc = time.perf_counter()
time_taken_to_upsert = toc - tic
print(f"\tDone Qdrant upsert: {time_taken_to_upsert:0.4f}")
return time_taken_to_upsert


# Do cosine similarity search over our Qdrant collection
def do_search(vector):
print("\tDoing cosine similarity search with Qdrant")
tic = time.perf_counter()
results = qdrant.search(collection_name="test", query_vector=vector, limit=1)
toc = time.perf_counter()
time_done = toc - tic
print(f"\tDone doing cosine similarity search: {time_done:0.4f}\n")
# Return the matched document's text so the context matches the other vector stores
result = results[0].payload["text"]
return (result, time_done)