Spaces:
Runtime error
Runtime error
import argparse
import json
import os
import sys

import openai
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings.fake import FakeEmbeddings
from langchain_community.vectorstores import Vectara
from langchain_text_splitters import RecursiveCharacterTextSplitter

from schema import Metadata, BimDiscipline
# Populate the process environment from a local .env file before the
# lookups below.
load_dotenv()

# The three Vectara settings are mandatory: indexing os.environ raises
# KeyError at import time if any of them is missing.
vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']
vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']
vectara_api_key = os.environ['VECTARA_API_KEY']

# Module-level Vectara vector store handle built from those credentials.
vectorstore = Vectara(
    vectara_customer_id=vectara_customer_id,
    vectara_corpus_id=vectara_corpus_id,
    vectara_api_key=vectara_api_key,
)
# Prompt prefix sent to the LLM as the user message. extract_metadata()
# appends the document text and a closing double quote right after the
# trailing `context="` line.
#
# NOTE: this is a plain string (not an f-string and never passed to
# str.format), so nothing inside it is interpolated. The valid disciplines are
# therefore written out literally and the instruction sentence refers to that
# list instead of embedding a Python expression the model would see verbatim.
# NOTE(review): the literal list presumably mirrors schema.BimDiscipline --
# confirm and keep the two in sync.
prompt_template = """
BimDiscipline = ['plumbing', 'network', 'heating', 'electrical', 'ventilation', 'architecture']
You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the title, a brief one-sentence summary, and the discipline the document belongs to, distinguishing between the BimDiscipline values listed above based on the given document.
Analyze the provided document, which could be in either German or English. Extract the title, summarize it briefly in one sentence, and infer the discipline. Document:
context="
"""
def ingest(file_path):
    """Load a .txt or .pdf document and split it into text chunks.

    Args:
        file_path: Path to the document. The loader is picked from the file
            extension, compared case-insensitively.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` (configured
        with ``chunk_size=1000`` and ``chunk_overlap=0``) for the loaded
        document.

    Raises:
        NotImplementedError: If the extension is neither ``.txt`` nor ``.pdf``.
    """
    # splitext only looks at the final path component, so a dot in a directory
    # name, or an extension-less file that happens to be called "pdf" or
    # "txt", is no longer mistaken for a supported type.
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.pdf':
        loader = UnstructuredPDFLoader(file_path)
    elif ext == '.txt':
        loader = TextLoader(file_path)
    else:
        raise NotImplementedError('Only .txt or .pdf files are supported')

    # transform locally
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        # Listed from coarsest to finest; the empty string is the last resort.
        separators=[
            "\n\n",
            "\n",
            " ",
            ",",
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            # "\u200B",  # Zero-width space (Asian languages)
            # "\u3002",  # Ideographic full stop (Asian languages)
            "",
        ],
    )
    return text_splitter.split_documents(documents)
def _parse_json_reply(reply):
    """Decode the model's reply text as JSON, tolerating a Markdown code fence."""
    if not reply or not reply.strip():
        raise ValueError("The model returned an empty response; expected JSON")
    text = reply.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0]
    return json.loads(text)


def extract_metadata(docs):
    """Ask the LLM for title, summary and discipline of a chunked document.

    The text of all chunks is concatenated, appended to ``prompt_template``
    and sent to the Mixtral model through Together's OpenAI-compatible
    endpoint. The reply is parsed as JSON.

    Args:
        docs: Iterable of chunks exposing a ``page_content`` string, such as
            the result of ``ingest``.

    Returns:
        The value decoded from the model's JSON reply.

    Raises:
        KeyError: If the ``TOGETHER_API_KEY`` environment variable is unset.
        ValueError: If the reply is empty or is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # plain text: blank lines and ".." runs are removed from every chunk
    # before the chunks are glued together without a separator.
    context = "".join(
        doc.page_content.replace('\n\n', '').replace('..', '') for doc in docs
    )
    # The template ends with `context="`; close that quote after the text.
    prompt = f'{prompt_template}{context}"'

    # Create client
    client = openai.OpenAI(
        base_url="https://api.together.xyz/v1",
        api_key=os.environ["TOGETHER_API_KEY"],
    )

    # No response_format / JSON schema is sent with the request, so JSON
    # output relies on the two prompts alone.
    # NOTE(review): the imported `Metadata` looks like it was meant to supply
    # a schema here -- confirm before wiring it in.
    chat_completion = client.chat.completions.create(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that responsds in JSON format",
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],
    )
    return _parse_json_reply(chat_completion.choices[0].message.content)
def main(argv=None):
    """Command-line entry point: print JSON metadata for one BIM document.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        ``0`` after the metadata has been printed, ``-1`` when the given path
        is not an existing regular file.
    """
    parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
    parser.add_argument("document", metavar="FILEPATH", type=str,
                        help="Path to the BIM document")
    args = parser.parse_args(argv)

    # isfile() is already False for a missing path, so a separate exists()
    # check adds nothing.
    if not os.path.isfile(args.document):
        # Report on stderr so stdout only ever carries the JSON result.
        print("File '{}' not found or not accessible.".format(args.document),
              file=sys.stderr)
        return -1

    docs = ingest(args.document)
    metadata = extract_metadata(docs)
    print(json.dumps(metadata, indent=2))
    return 0


if __name__ == "__main__":
    sys.exit(main())