Spaces:
Sleeping
Sleeping
File size: 3,680 Bytes
1ff214d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
import concurrent.futures
import os
import random

import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_google_genai import ChatGoogleGenerativeAI
# API keys are read from the environment so that credentials never live in
# source control (any key that was previously committed should be revoked).
# GOOGLE_API_KEY is required; GOOGLE_API_KEY_2 is optional and falls back to
# the primary key, in which case both clients share one credential.
_primary_api_key = os.environ.get("GOOGLE_API_KEY")
if not _primary_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it (and optionally GOOGLE_API_KEY_2) "
        "before starting the app."
    )
_secondary_api_key = os.environ.get("GOOGLE_API_KEY_2") or _primary_api_key

# Two clients with identical settings; feature_extraction picks one at random
# for each request.
gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_primary_api_key, temperature=0.1)
gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_secondary_api_key, temperature=0.1)
def pdf_extractor(link):
    """Load the PDF at *link* and return its text as a one-element list.

    The documents produced by ``PyPDFLoader(link).load_and_split()`` are
    concatenated in order with no separator between them.

    Args:
        link: Location of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        A list containing a single string with the concatenated page text.
    """
    documents = PyPDFLoader(link).load_and_split()
    return ["".join(doc.page_content for doc in documents)]
def web_extractor(link):
    """Load the web page at *link* and return its text as a one-element list.

    The documents produced by ``WebBaseLoader(link).load_and_split()`` are
    concatenated in order with no separator between them.

    Args:
        link: Address of the page, passed straight to ``WebBaseLoader``.

    Returns:
        A list containing a single string with the concatenated page text.
    """
    documents = WebBaseLoader(link).load_and_split()
    return ["".join(doc.page_content for doc in documents)]
def feature_extraction(tag, history, context):
    """Ask a Gemini client to fold new context into one field's history.

    A prompt is built from the field name, the text gathered so far and the
    new context; one of the two module-level clients (``gemini``/``gemini1``)
    is picked at random and invoked with it.

    Args:
        tag: Name of the field to update (interpolated into the prompt).
        history: Text gathered so far for that field.
        context: New material to mine for details; converted with ``str()``.

    Returns:
        The ``content`` attribute of the model's response.
    """
    # The prompt body is kept flush-left so no indentation leaks into the text
    # that is sent to the model.
    instructions = f'''
You are an intelligent assistant tasked with updating product information. You have two data sources:
1. Tag_History: Previously gathered information about the product.
2. Tag_Context: New data that might contain additional details.
Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
Guidelines:
- Only add new details that are relevant to the {tag} FIELD.
- Do not add or modify any other fields in the Tag_History.
- Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
Here is the data:
Tag_Context: {str(context)}
Tag_History: {history}
Respond with the updated Tag_History.
'''
    chosen_client = random.choice((gemini, gemini1))
    return chosen_client.invoke(instructions).content
def main(link):
    """Extract the standard manual sections from the document at *link*.

    The document is loaded as a web page or as a PDF, split into chunks, and
    for every chunk one ``feature_extraction`` call per section is run on a
    thread pool. Each response replaces the stored text for its section, and
    intermediate results and errors are written to the Streamlit page.

    Args:
        link: Link to the product document. Links ending in ``.md`` or whose
            address starts with ``en.`` right after the ``://`` are loaded
            with ``web_extractor``; everything else goes to ``pdf_extractor``.

    Returns:
        dict mapping each section name to the latest text returned for it
        (an empty string when no call for that section succeeded).
    """
    history = {
        "Introduction": "",
        "Specifications": "",
        "Product Overview": "",
        "Safety Information": "",
        "Installation Instructions": "",
        "Setup and Configuration": "",
        "Operation Instructions": "",
        "Maintenance and Care": "",
        "Troubleshooting": "",
        "Warranty Information": "",
        "Legal Information": ""
    }

    # Extract Text
    # Look at what follows the scheme instead of a fixed slice: link[8:11]
    # only lines up with "https://", so "http://en...." links would be sent to
    # the PDF loader and unrelated strings could match by accident.
    after_scheme = link.partition('://')[2]
    if link.endswith('.md') or after_scheme.startswith('en.'):
        text = web_extractor(link)
    else:
        text = pdf_extractor(link)

    # Create Chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=100,
        # NOTE(review): the two empty separators look like escape sequences
        # (e.g. "\n\n", "\n") that were lost somewhere; left as found, confirm.
        separators=["", '', " "]
    )
    chunks = text_splitter.create_documents(text)

    # One pool serves all chunks rather than being rebuilt per chunk. Chunks
    # are still handled one after another because each request is submitted
    # with the history produced by the previous chunk.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for chunk in chunks:
            future_to_key = {
                executor.submit(feature_extraction, key, history[key], chunk.page_content): key
                for key in history
            }
            for future in concurrent.futures.as_completed(future_to_key):
                key = future_to_key[future]
                try:
                    response = future.result()
                    history[key] = response
                    st.write(f"Intermediate result for {key}: {response}")
                except Exception as e:
                    # Best effort: a failed section keeps its previous text
                    # and the error is shown in the UI instead of aborting.
                    st.write(f"Error processing {key}: {e}")
    return history
# Streamlit page: a title, a text box for the document link and a button that
# runs the extraction and renders the resulting fields as JSON.
st.title('Extract Fields from Product Manuals')
link = st.text_input('Enter the link to the product document:')
if st.button('Process'):
    if not link:
        # Nothing was typed into the text box.
        st.write('Please enter a valid link.')
    else:
        final_result = main(link)
        st.write('Final extracted fields/tags:')
        st.json(final_result)
|