{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ " # Alternative Embeddings\n", " \n", " This notebook demonstrates how to use alternative embedding functions.\n", " " ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import chromadb" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "client = chromadb.Client()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from chromadb.utils import embedding_functions" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Using OpenAI Embeddings. This assumes you have the openai package installed\n", "openai_ef = embedding_functions.OpenAIEmbeddingFunction(\n", " api_key=\"OPENAI_KEY\", # Replace with your own OpenAI API key\n", " model_name=\"text-embedding-ada-002\"\n", ")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Create a new chroma collection\n", "openai_collection = client.get_or_create_collection(name=\"openai_embeddings\", embedding_function=openai_ef)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "openai_collection.add(\n", " documents=[\"This is a document\", \"This is another document\"],\n", " metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n", " ids=[\"id1\", \"id2\"]\n", ")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'ids': [['id1', 'id2']],\n", " 'distances': [[0.1385088860988617, 0.2017185091972351]],\n", " 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],\n", " 'embeddings': None,\n", " 'documents': [['This is a document', 'This is another document']]}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results = openai_collection.query(\n", " 
query_texts=[\"This is a query document\"],\n", " n_results=2\n", ")\n", "results" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Using Cohere Embeddings. This assumes you have the cohere package installed\n", "cohere_ef = embedding_functions.CohereEmbeddingFunction(\n", " api_key=\"COHERE_API_KEY\", \n", " model_name=\"large\"\n", ")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Create a new chroma collection\n", "cohere_collection = client.create_collection(name=\"cohere_embeddings\", embedding_function=cohere_ef)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "cohere_collection.add(\n", " documents=[\"This is a document\", \"This is another document\"],\n", " metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n", " ids=[\"id1\", \"id2\"]\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'ids': [['id1', 'id2']],\n", " 'embeddings': None,\n", " 'documents': [['This is a document', 'This is another document']],\n", " 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],\n", " 'distances': [[4343.1328125, 5653.28759765625]]}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results = cohere_collection.query(\n", " query_texts=[\"This is a query document\"],\n", " n_results=2\n", ")\n", "results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Using Instructor models. The embedding function requires the InstructorEmbedding package. 
\n", "# To install it, run pip install InstructorEmbedding\n", "\n", "\n", "#uses base model and cpu\n", "instructor_ef = embedding_functions.InstructorEmbeddingFunction() \n", "\n", "# For task specific embeddings, add an instruction\n", "# instructor_ef = embedding_functions.InstructorEmbeddingFunction(\n", "# instruction=\"Represent the Wikipedia document for retrieval: \"\n", "# )\n", "\n", "# Uses hkunlp/instructor-xl model and GPU\n", "#instructor_ef = embedding_functions.InstructorEmbeddingFunction(model_name=\"hkunlp/instructor-xl\", device=\"cuda\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a collection with the instructor embedding function\n", "instructor_collection = client.create_collection(name=\"instructor_embeddings\", embedding_function=instructor_ef)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "instructor_collection.add(\n", " documents=[\"This is a document\", \"This is another document\"],\n", " metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n", " ids=[\"id1\", \"id2\"]\n", ")\n", "\n", "# Adding documents with an instruction\n", "# instructor_ef = embedding_functions.InstructorEmbeddingFunction(\n", "# instruction=\"Represent the Science sentence: \"\n", "# )\n", "# instructor_collection = client.create_collection(name=\"instructor_embeddings\", embedding_function=instructor_ef)\n", "# instructor_collection.add(documents=[\"Parton energy loss in QCD matter\"], ids=[\"id1\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "results = instructor_collection.query(\n", " query_texts=[\"This is a query document\"],\n", " n_results=2\n", ")\n", "results\n", "\n", "# Querying with an instruction\n", "# instructor_ef = embedding_functions.InstructorEmbeddingFunction(instruction=\"Represent the Wikipedia question for retrieving supporting documents: \")\n", "# 
instructor_collection = client.get_collection(name=\"instructor_embeddings\", embedding_function=instructor_ef)\n", "# results = instructor_collection.query(query_texts=[\"where is the food stored in a yam plant\"])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Using HuggingFace models. The embedding function requires a huggingface api_key\n", "huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(\n", " api_key=\"HUGGINGFACE_API_KEY\", # Replace with your own HuggingFace API key\n", " model_name=\"sentence-transformers/all-MiniLM-L6-v2\"\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Create a new HuggingFace collection\n", "huggingface_collection = client.create_collection(name=\"huggingface_embeddings\", embedding_function=huggingface_ef)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "huggingface_collection.add(\n", " documents=[\"This is a document\", \"This is another document\"],\n", " metadatas=[{\"source\": \"my_source\"}, {\"source\": \"my_source\"}],\n", " ids=[\"id1\", \"id2\"]\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'ids': [['id1', 'id2']],\n", " 'embeddings': None,\n", " 'documents': [['This is a document', 'This is another document']],\n", " 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],\n", " 'distances': [[0.7111215591430664, 1.010978102684021]]}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results = huggingface_collection.query(\n", " query_texts=[\"This is a query document\"],\n", " n_results=2\n", ")\n", "results" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": 
"text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" } }, "nbformat": 4, "nbformat_minor": 2 }