Spaces:
Runtime error
Runtime error
Commit
·
88c1065
1
Parent(s):
150092a
Upload 16 files
Browse files- .gitattributes +4 -0
- 10K_Annual_Reports/Alphabet.pdf +0 -0
- 10K_Annual_Reports/Amazon.pdf +0 -0
- 10K_Annual_Reports/Apple.pdf +0 -0
- 10K_Annual_Reports/Meta.pdf +3 -0
- 10K_Annual_Reports/Microsoft.pdf +3 -0
- 10K_Annual_Reports/Netflix.pdf +3 -0
- 10K_Annual_Reports/Tesla.pdf +3 -0
- VectorStoreIndex/chroma-collections.parquet +3 -0
- VectorStoreIndex/chroma-embeddings.parquet +3 -0
- VectorStoreIndex/index/id_to_uuid_4687da76-fa8c-47cd-96a2-c9f3fc08313a.pkl +3 -0
- VectorStoreIndex/index/index_4687da76-fa8c-47cd-96a2-c9f3fc08313a.bin +3 -0
- VectorStoreIndex/index/index_metadata_4687da76-fa8c-47cd-96a2-c9f3fc08313a.pkl +3 -0
- VectorStoreIndex/index/uuid_to_id_4687da76-fa8c-47cd-96a2-c9f3fc08313a.pkl +3 -0
- app.py +46 -0
- requirements.txt +7 -0
- vectorstore.py +86 -0
.gitattributes
CHANGED
@@ -36,3 +36,7 @@ Web[[:space:]]Application/10K_Annual_Reports/Meta.pdf filter=lfs diff=lfs merge=
|
|
36 |
Web[[:space:]]Application/10K_Annual_Reports/Microsoft.pdf filter=lfs diff=lfs merge=lfs -text
|
37 |
Web[[:space:]]Application/10K_Annual_Reports/Netflix.pdf filter=lfs diff=lfs merge=lfs -text
|
38 |
Web[[:space:]]Application/10K_Annual_Reports/Tesla.pdf filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
36 |
Web[[:space:]]Application/10K_Annual_Reports/Microsoft.pdf filter=lfs diff=lfs merge=lfs -text
|
37 |
Web[[:space:]]Application/10K_Annual_Reports/Netflix.pdf filter=lfs diff=lfs merge=lfs -text
|
38 |
Web[[:space:]]Application/10K_Annual_Reports/Tesla.pdf filter=lfs diff=lfs merge=lfs -text
|
39 |
+
10K_Annual_Reports/Meta.pdf filter=lfs diff=lfs merge=lfs -text
|
40 |
+
10K_Annual_Reports/Microsoft.pdf filter=lfs diff=lfs merge=lfs -text
|
41 |
+
10K_Annual_Reports/Netflix.pdf filter=lfs diff=lfs merge=lfs -text
|
42 |
+
10K_Annual_Reports/Tesla.pdf filter=lfs diff=lfs merge=lfs -text
|
10K_Annual_Reports/Alphabet.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
10K_Annual_Reports/Amazon.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
10K_Annual_Reports/Apple.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
10K_Annual_Reports/Meta.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4a2bebf058c6e947c09f9fdb510010a92f6698b458941956ad0bbdaa043ae6de
|
3 |
+
size 1111637
|
10K_Annual_Reports/Microsoft.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:50507a219c93a452c1a15e1c5bb5d01d53a97d75c1ce91ea0a9703ef7debca95
|
3 |
+
size 1547825
|
10K_Annual_Reports/Netflix.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d95d9d4a03473863582a234e8edfd97eac97f1be9e552f9467e95dd8ce61280e
|
3 |
+
size 1410523
|
10K_Annual_Reports/Tesla.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3a2bfbae724f9f4a7b28539993ca79c54db6ead3ff5105693a546e5a2134bbde
|
3 |
+
size 2659773
|
VectorStoreIndex/chroma-collections.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:250cf833cc54545b03d2454a5ff23eda3e047f8a3c465d29243f2e697b095848
|
3 |
+
size 557
|
VectorStoreIndex/chroma-embeddings.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:32c73aa28836865bbc6964cb3f8f0a540b9639828e39b6fa3c4ae0cb7fc7a1a3
|
3 |
+
size 114611418
|
VectorStoreIndex/index/id_to_uuid_4687da76-fa8c-47cd-96a2-c9f3fc08313a.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c839634ab7858bf13401325e4055d5a3df0dcd5984705ecd5d83a79966363e0e
|
3 |
+
size 150307
|
VectorStoreIndex/index/index_4687da76-fa8c-47cd-96a2-c9f3fc08313a.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4133367b79c8a0bed4a21a4885f7d35008f9bc69c9fd0b513eafcfb59faddb0b
|
3 |
+
size 29136520
|
VectorStoreIndex/index/index_metadata_4687da76-fa8c-47cd-96a2-c9f3fc08313a.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4be5c2f38188c24bc82c3ae21db9bcbf876838e71e8b95c31435a90f960c26f2
|
3 |
+
size 74
|
VectorStoreIndex/index/uuid_to_id_4687da76-fa8c-47cd-96a2-c9f3fc08313a.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ecdc7842716c5b96c8438096e1d1f5a276da5742cf13ea2101f83de45c0f5456
|
3 |
+
size 175727
|
app.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Gradio app that answers questions about companies' 10-K annual reports.

Builds (or loads) a persisted Chroma vector index over the PDFs in
``10K_Annual_Reports/`` and exposes a simple text-in/text-out interface.
"""
import os

import gradio
from langchain.document_loaders import UnstructuredPDFLoader
# NOTE: the project-local VectorstoreIndexCreator (vectorstore.py) is used
# because it adds from_persistent_index(); the original import of
# langchain.indexes.VectorstoreIndexCreator was immediately shadowed by this
# one and has been removed.
from vectorstore import VectorstoreIndexCreator

# The original `os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")`
# was a no-op when the key was set and raised an obscure TypeError (env values
# must be str, not None) when it was not. Fail early with a clear message.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("The OPENAI_API_KEY environment variable must be set.")

text_folder = '10K_Annual_Reports'
loaders = [UnstructuredPDFLoader(os.path.join(text_folder, fn)) for fn in os.listdir(text_folder)]

# Create the index, if it does not exist, and save it
if not os.path.isfile('VectorStoreIndex/chroma-embeddings.parquet'):
    from langchain.vectorstores import Chroma
    index = VectorstoreIndexCreator(
        vectorstore_cls=Chroma,
        vectorstore_kwargs={"persist_directory": "VectorStoreIndex/"},
    ).from_loaders(loaders)
    index.vectorstore.persist()

# Load the saved index
index_saved = VectorstoreIndexCreator().from_persistent_index("VectorStoreIndex/")

description = """This is an AI conversational agent where you provide it with the annual reports of companies, and it can study it and answer any questions
you have about it. Currently, the LLM has been trained on the following companies' 10-K reports: Amazon, Apple, Alphabet (Google), Meta (Facebook), Microsoft,
Netflix and Tesla. I plan to include more companies' 10-K reports in future.

Once the LLM is trained on a new 10-K report, it stores the vector embeddings of the document locally using ChromaDB to make the querying faster and also to
save time and money on creating the vector embeddings for the same document in future.

The LLM's universe is only the 10-K reports it has been trained on; it cannot pull information from the internet. So, you can ask it about anything that's
contained in their 10-K reports. If it cannot find an answer to your query within the 10-K reports, it will reply with "I don't know". Some examples of questions
you can ask are:

- What are the risks for Tesla?
- What was Google's earnings for the last fiscal year?
- Who are the competitors of Apple?

An example of querying about something the LLM's training did not include:

- Query: "What is Tesco?"
- Response: " Tesco is not mentioned in the context, so I don't know."
"""

def chat_response(query):
    """Answer *query* by running it against the persisted vector index."""
    return index_saved.query(query)

interface = gradio.Interface(fn=chat_response, inputs="text", outputs="text", title='Annual Reports GPT', description=description)

interface.launch() #server_name="0.0.0.0", server_port=8080, share=True)
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
langchain
|
3 |
+
unstructured
|
4 |
+
openai
|
5 |
+
chromadb
|
6 |
+
# (duplicate entry removed: 'unstructured' is already listed above)
|
7 |
+
tiktoken
|
vectorstore.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, List, Optional, Type
|
2 |
+
|
3 |
+
from pydantic import BaseModel, Extra, Field
|
4 |
+
|
5 |
+
from langchain.base_language import BaseLanguageModel
|
6 |
+
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
7 |
+
from langchain.chains.retrieval_qa.base import RetrievalQA
|
8 |
+
from langchain.document_loaders.base import BaseLoader
|
9 |
+
from langchain.embeddings.base import Embeddings
|
10 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
11 |
+
from langchain.llms.openai import OpenAI
|
12 |
+
from langchain.schema import Document
|
13 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
14 |
+
from langchain.vectorstores.base import VectorStore
|
15 |
+
from langchain.vectorstores.chroma import Chroma
|
16 |
+
|
17 |
+
|
18 |
+
def _get_default_text_splitter() -> TextSplitter:
    """Build the default splitter: 1000-character chunks with no overlap."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return splitter
|
20 |
+
|
21 |
+
|
22 |
+
class VectorStoreIndexWrapper(BaseModel):
    """Convenience wrapper that exposes question-answering helpers on top of
    an underlying vector store."""

    # The wrapped store; all retrieval runs against it.
    vectorstore: VectorStore

    class Config:
        """Pydantic configuration: reject unknown fields, and permit the
        non-pydantic VectorStore type as a field value."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    def query(
        self, question: str, llm: Optional[BaseLanguageModel] = None, **kwargs: Any
    ) -> str:
        """Answer *question* via a RetrievalQA chain over the vectorstore.

        When *llm* is not supplied, a deterministic OpenAI model
        (temperature=0) is used. Extra keyword arguments are forwarded to the
        chain constructor.
        """
        chosen_llm = llm or OpenAI(temperature=0)
        retriever = self.vectorstore.as_retriever()
        qa_chain = RetrievalQA.from_chain_type(chosen_llm, retriever=retriever, **kwargs)
        return qa_chain.run(question)

    def query_with_sources(
        self, question: str, llm: Optional[BaseLanguageModel] = None, **kwargs: Any
    ) -> dict:
        """Answer *question* and also report the source documents used.

        Same LLM default and kwargs forwarding as :meth:`query`, but routes
        through RetrievalQAWithSourcesChain and returns its full result dict.
        """
        chosen_llm = llm or OpenAI(temperature=0)
        retriever = self.vectorstore.as_retriever()
        sources_chain = RetrievalQAWithSourcesChain.from_chain_type(
            chosen_llm, retriever=retriever, **kwargs
        )
        return sources_chain({sources_chain.question_key: question})
|
52 |
+
|
53 |
+
|
54 |
+
class VectorstoreIndexCreator(BaseModel):
    """Factory that turns loaders or documents into a queryable
    :class:`VectorStoreIndexWrapper`, and can reopen a persisted index."""

    # Vector store implementation to instantiate; defaults to Chroma.
    vectorstore_cls: Type[VectorStore] = Chroma
    # Embedding model used for both indexing and querying.
    embedding: Embeddings = Field(default_factory=OpenAIEmbeddings)
    # Splitter applied to documents before embedding.
    text_splitter: TextSplitter = Field(default_factory=_get_default_text_splitter)
    # Extra keyword arguments forwarded to the vector store constructor.
    vectorstore_kwargs: dict = Field(default_factory=dict)

    class Config:
        """Pydantic configuration: reject unknown fields, and permit the
        non-pydantic field types declared above."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    def from_loaders(self, loaders: List[BaseLoader]) -> VectorStoreIndexWrapper:
        """Load every loader's documents and index them all together."""
        gathered: List[Document] = []
        for doc_loader in loaders:
            gathered.extend(doc_loader.load())
        return self.from_documents(gathered)

    def from_documents(self, documents: List[Document]) -> VectorStoreIndexWrapper:
        """Split *documents* into chunks, embed them, and wrap the store."""
        chunks = self.text_splitter.split_documents(documents)
        store = self.vectorstore_cls.from_documents(
            chunks, self.embedding, **self.vectorstore_kwargs
        )
        return VectorStoreIndexWrapper(vectorstore=store)

    def from_persistent_index(self, path: str) -> VectorStoreIndexWrapper:
        """Reopen an index previously persisted at *path* (no re-embedding)."""
        store = self.vectorstore_cls(persist_directory=path, embedding_function=self.embedding)
        return VectorStoreIndexWrapper(vectorstore=store)
|