Spaces:
Runtime error
Runtime error
| # -*- coding: utf-8 -*- | |
| """langchain_vectara.ipynb | |
| Automatically generated by Colab. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1XzD7XHO_a-gYFBnGUWH1MOtstT4sDY3J | |
| """ | |
import subprocess
import sys

# The notebook used `!pip install ...`, which is IPython shell syntax and a
# SyntaxError in a plain Python module. Run pip through the current
# interpreter instead so the packages land in the environment executing this
# script. check=False mirrors the notebook, where a failed `!` command does
# not abort the following cells.
_pip_install = [sys.executable, "-m", "pip", "install"]
subprocess.run(_pip_install + ["-r", "requirements.txt"], check=False)
subprocess.run(
    _pip_install
    + [
        "langchain_community",
        "langchain-text-splitters",
        "unstructured[local-inference]",
        "pdf2image",
        "pdfminer.six",
        "langchain-together",
        "pillow_heif",
    ],
    check=False,
)
| from langchain_community.document_loaders import TextLoader | |
| from langchain_community.embeddings.fake import FakeEmbeddings | |
| from langchain_community.vectorstores import Vectara | |
| from langchain_text_splitters import CharacterTextSplitter | |
import os

try:
    # google.colab is only importable inside a Colab runtime; anywhere else
    # the unconditional import crashes the script at start-up.
    from google.colab import userdata
except ImportError:
    userdata = None


def read_secret(name):
    """Return the secret stored under *name*.

    Inside Colab the value comes from ``userdata.get(name)``, exactly as
    before. Outside Colab it is read from the process environment and is
    ``None`` when the variable is not set.
    """
    if userdata is not None:
        return userdata.get(name)
    return os.environ.get(name)


TOGETHER_API_KEY = read_secret('TOGETHER_API_KEY')
vectara_customer_id = read_secret('VECTARA_CUSTOMER_ID')
vectara_corpus_id = read_secret('VECTARA_CORPUS_ID')
vectara_api_key = read_secret('VECTARA_API_KEY')
# Vectara client bound to one customer/corpus pair; the three values are the
# credentials read earlier in this script.
_vectara_settings = {
    "vectara_customer_id": vectara_customer_id,
    "vectara_corpus_id": vectara_corpus_id,
    "vectara_api_key": vectara_api_key,
}
vectorstore = Vectara(**_vectara_settings)
| from langchain_community.document_loaders import UnstructuredPDFLoader | |
from pathlib import Path

# `!mkdir docs` is IPython shell syntax and does not parse as Python.
# exist_ok=True keeps a re-run from failing when the directory is already there.
Path("docs").mkdir(exist_ok=True)
# Sample file to upload before running. The path is relative, so it is
# resolved against the current working directory.
_sample_pdf = 'ISB-020-U3-W-S-01-B18003-001-020.pdf'
loader = UnstructuredPDFLoader(_sample_pdf, strategy='fast')
data = loader.load()

# Split the loaded documents with chunk_size=1000 and no overlap.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
docs = text_splitter.split_documents(data)
| import json | |
| from langchain_community.document_transformers import DoctranPropertyExtractor | |
| from langchain_core.documents import Document | |
def _schema_entry(name, description, kind, required, **extras):
    """Build one property-schema dict.

    Key order matches the hand-written literals this replaces: name,
    description, type, any type-specific extras, then required.
    """
    return {
        "name": name,
        "description": description,
        "type": kind,
        **extras,
        "required": required,
    }


# Metadata fields to extract from each document.
properties = [
    _schema_entry(
        "document_number",
        "Unique identifier for the document within its project.",
        "string",
        True,
    ),
    _schema_entry(
        "discipline",
        "The discipline associated with the document.",
        "string",
        True,
    ),
    _schema_entry("title", "Title of the document.", "string", True),
    _schema_entry("version", "Version number of the document.", "integer", True),
    _schema_entry(
        "date",
        "Creation date of the document.",
        "string",
        True,
        format="date",
    ),
    _schema_entry(
        "author",
        "Author of the document.",
        "object",
        True,
        properties={
            "name": {"type": "string", "required": True},
            "email": {"type": "string", "format": "email", "required": False},
        },
    ),
    _schema_entry(
        "related_documents",
        "List of related documents.",
        "array",
        False,
        items={"type": "string"},
    ),
    _schema_entry(
        "status",
        "Current status of the document.",
        "string",
        True,
        enum=["draft", "under_review", "approved", "rejected"],
    ),
    _schema_entry(
        "keywords",
        "Keywords associated with the document.",
        "array",
        False,
        items={"type": "string"},
    ),
    _schema_entry(
        "summary",
        "Short summary of the document content.",
        "string",
        False,
    ),
]
# Extractor configured with the schema held in `properties`.
# NOTE(review): presumably needs the doctran package and an OpenAI API key at
# runtime — confirm before deploying.
property_extractor = DoctranPropertyExtractor(properties=properties)
from dotenv import load_dotenv
# Read key=value pairs from a .env file, when one is found, into the process
# environment.
load_dotenv()