# Streamlit app: extract structured fields/tags from product manuals
# (PDF or web pages) using Google Gemini via LangChain.
import concurrent.futures
import random

import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain_google_genai import ChatGoogleGenerativeAI

# SECURITY: these API keys are hardcoded and committed with the source.
# They must be treated as compromised — rotate them and load replacements
# from an environment variable or a secrets manager (e.g. st.secrets).
gemini = ChatGoogleGenerativeAI(
    model="gemini-1.0-pro-001",
    google_api_key='AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA',
    temperature=0.1,
)
gemini1 = ChatGoogleGenerativeAI(
    model="gemini-1.0-pro-001",
    google_api_key='AIzaSyABsaDjPujPCBlz4LLxcXDX_bDA9uEL7Xc',
    temperature=0.1,
)
def pdf_extractor(link):
    """Extract all text from the PDF at *link*.

    Args:
        link: URL or file path of a PDF document.

    Returns:
        A one-element list containing the concatenated page text —
        the shape expected by
        RecursiveCharacterTextSplitter.create_documents().
    """
    loader = PyPDFLoader(link)
    pages = loader.load_and_split()
    # join() builds the full text in one pass instead of quadratic `+=`.
    return [''.join(page.page_content for page in pages)]
def web_extractor(link):
    """Extract all text from the web page at *link*.

    Args:
        link: URL of an HTML page.

    Returns:
        A one-element list containing the concatenated page text —
        the shape expected by
        RecursiveCharacterTextSplitter.create_documents().
    """
    loader = WebBaseLoader(link)
    pages = loader.load_and_split()
    # join() builds the full text in one pass instead of quadratic `+=`.
    return [''.join(page.page_content for page in pages)]
def feature_extraction(tag, history, context):
    """Ask Gemini to merge new chunk text into one field's running value.

    Args:
        tag: Name of the field being updated (e.g. "Specifications").
        history: The text accumulated so far for that field.
        context: A new chunk of document text that may add details.

    Returns:
        The model's updated text for the field (str).
    """
    prompt = f'''
You are an intelligent assistant tasked with updating product information. You have two data sources:
1. Tag_History: Previously gathered information about the product.
2. Tag_Context: New data that might contain additional details.
Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
Guidelines:
- Only add new details that are relevant to the {tag} FIELD.
- Do not add or modify any other fields in the Tag_History.
- Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
Here is the data:
Tag_Context: {str(context)}
Tag_History: {history}
Respond with the updated Tag_History.
'''
    # Randomly alternate between the two clients (two API keys) to spread
    # requests across per-key rate limits.
    model = random.choice([gemini, gemini1])
    result = model.invoke(prompt)
    return result.content
def main(link):
    """Extract manual text from *link* and fill in the field dictionary.

    For every chunk of the document, all fields are updated concurrently
    via feature_extraction(), each update building on the previous value.

    Args:
        link: URL/path of the product manual (PDF, .md, or 'en.' web page).

    Returns:
        dict mapping field name -> accumulated extracted text.
    """
    history = {
        "Introduction": "",
        "Specifications": "",
        "Product Overview": "",
        "Safety Information": "",
        "Installation Instructions": "",
        "Setup and Configuration": "",
        "Operation Instructions": "",
        "Maintenance and Care": "",
        "Troubleshooting": "",
        "Warranty Information": "",
        "Legal Information": ""
    }

    # Extract text. NOTE(review): `link[8:11] == 'en.'` assumes an
    # 'https://en.' prefix — fragile; confirm against real input URLs.
    if link.endswith('.md') or link[8:11] == 'en.':
        text = web_extractor(link)
    else:
        text = pdf_extractor(link)

    # Create chunks. The original separators `["", '', " "]` contained a
    # duplicated empty string — almost certainly newline separators lost to
    # whitespace mangling; restored to the splitter's conventional order.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = text_splitter.create_documents(text)

    # One executor for the whole run instead of re-creating a pool per chunk.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for chunk in chunks:
            # Fan out one model call per field for this chunk.
            future_to_key = {
                executor.submit(feature_extraction, key, history[key], chunk.page_content): key
                for key in history
            }
            for future in concurrent.futures.as_completed(future_to_key):
                key = future_to_key[future]
                try:
                    response = future.result()
                    history[key] = response
                    st.write(f"Intermediate result for {key}: {response}")
                except Exception as e:
                    # Best-effort: report the failure and keep the old value.
                    st.write(f"Error processing {key}: {e}")
    return history
# --- Streamlit UI ------------------------------------------------------------
st.title('Extract Fields from Product Manuals')
link = st.text_input('Enter the link to the product document:')

if st.button('Process'):
    if not link:
        st.write('Please enter a valid link.')
    else:
        final_result = main(link)
        st.write('Final extracted fields/tags:')
        st.json(final_result)