Spaces:

Prathmesh48
/

Extract-Fields

Sleeping

App Files Files Community

Extract-Fields / app.py

Prathmesh48

Create app.py

1ff214d verified 9 months ago

raw

history blame contribute delete

3.68 kB

	import concurrent.futures
	import os
	import random
	from urllib.parse import urlparse

	import streamlit as st
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
	from langchain_google_genai import ChatGoogleGenerativeAI

	# Gemini clients used by feature_extraction(), which picks one of the two at
	# random for every request.
	#
	# SECURITY: API keys must never be committed to source control. They are read
	# from the environment instead (on Hugging Face Spaces, configure them as
	# repository secrets). Any key that has ever appeared in this file's history
	# should be treated as leaked and revoked.
	#
	#   GOOGLE_API_KEY    - required; key for the first client.
	#   GOOGLE_API_KEY_2  - optional; key for the second client. Falls back to
	#                       GOOGLE_API_KEY when unset.
	GEMINI_MODEL = "gemini-1.0-pro-001"
	_primary_api_key = os.environ["GOOGLE_API_KEY"]
	_secondary_api_key = os.environ.get("GOOGLE_API_KEY_2", _primary_api_key)

	gemini = ChatGoogleGenerativeAI(model=GEMINI_MODEL, google_api_key=_primary_api_key, temperature=0.1)
	gemini1 = ChatGoogleGenerativeAI(model=GEMINI_MODEL, google_api_key=_secondary_api_key, temperature=0.1)

	def pdf_extractor(link):
	    """Read the PDF at ``link`` and return its text wrapped in a one-element list.

	    The document is loaded with ``PyPDFLoader(link).load_and_split()`` and the
	    ``page_content`` of every returned piece is concatenated in order, with
	    no separator in between.
	    """
	    documents = PyPDFLoader(link).load_and_split()
	    return ["".join(document.page_content for document in documents)]

	def web_extractor(link):
	    """Read the web page at ``link`` and return its text wrapped in a one-element list.

	    The page is loaded with ``WebBaseLoader(link).load_and_split()`` and the
	    ``page_content`` of every returned piece is concatenated in order, with
	    no separator in between.
	    """
	    documents = WebBaseLoader(link).load_and_split()
	    return ["".join(document.page_content for document in documents)]

	def feature_extraction(tag, history, context):
	    """Ask a Gemini model to merge new details for one field into its history.

	    Args:
	        tag: Name of the field being updated; interpolated into the prompt.
	        history: Text gathered so far for that field.
	        context: New material to mine for details; converted with ``str()``.

	    Returns:
	        The ``content`` attribute of the model's reply.
	    """
	    request = f'''
	You are an intelligent assistant tasked with updating product information. You have two data sources:
	1. Tag_History: Previously gathered information about the product.
	2. Tag_Context: New data that might contain additional details.

	Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.

	Guidelines:
	- Only add new details that are relevant to the {tag} FIELD.
	- Do not add or modify any other fields in the Tag_History.
	- Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.

	Here is the data:

	Tag_Context: {str(context)}
	Tag_History: {history}

	Respond with the updated Tag_History.
	'''
	    # One of the two module-level clients is chosen at random per call.
	    return random.choice([gemini, gemini1]).invoke(request).content

	def _is_web_page(link):
	    """Return True when ``link`` should be read as a web page rather than a PDF."""
	    # Markdown files and hosts on an "en." subdomain are fetched as web pages;
	    # everything else is assumed to be a PDF. The host is taken from the parsed
	    # URL so the check does not depend on the scheme being exactly "https://".
	    host = urlparse(link).hostname or ''
	    return link.lower().endswith('.md') or host.startswith('en.')


	def main(link):
	    """Extract the text behind ``link`` and fill in the product-manual fields.

	    The document is loaded (web page or PDF), split into chunks of up to
	    10000 characters with 100 characters of overlap, and every chunk is passed
	    to ``feature_extraction`` once per field. Fields of the same chunk are
	    processed concurrently; chunks are processed one after another so that
	    each request sees the text accumulated from earlier chunks.

	    Intermediate results and per-field errors are reported with ``st.write``.
	    A field whose request fails keeps its previous value.

	    Args:
	        link: URL (or path) of the product document.

	    Returns:
	        dict mapping each field name to the text collected for it.
	    """
	    history = {
	        "Introduction": "",
	        "Specifications": "",
	        "Product Overview": "",
	        "Safety Information": "",
	        "Installation Instructions": "",
	        "Setup and Configuration": "",
	        "Operation Instructions": "",
	        "Maintenance and Care": "",
	        "Troubleshooting": "",
	        "Warranty Information": "",
	        "Legal Information": "",
	    }

	    # Extract Text
	    text = web_extractor(link) if _is_web_page(link) else pdf_extractor(link)

	    # Create Chunks
	    # Separators go from coarse to fine. The empty string must come LAST: it
	    # always matches, so placing it first forces character-level splitting
	    # and makes every other separator unreachable.
	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=10000,
	        chunk_overlap=100,
	        separators=["\n\n", "\n", " ", ""],
	    )
	    chunks = text_splitter.create_documents(text)

	    # One pool serves all chunks; there is no need to rebuild it per chunk.
	    with concurrent.futures.ThreadPoolExecutor() as executor:
	        for chunk in chunks:
	            future_to_key = {
	                executor.submit(feature_extraction, key, value, chunk.page_content): key
	                for key, value in history.items()
	            }
	            # All fields of this chunk are drained before the next chunk is
	            # submitted, so the next round starts from the updated history.
	            for future in concurrent.futures.as_completed(future_to_key):
	                key = future_to_key[future]
	                try:
	                    response = future.result()
	                except Exception as e:
	                    # Best effort: report the failure and keep the old value.
	                    st.write(f"Error processing {key}: {e}")
	                else:
	                    history[key] = response
	                    st.write(f"Intermediate result for {key}: {response}")

	    return history

	# Streamlit page: one text box for the document link and a button that runs
	# the extraction and renders the resulting fields as JSON.
	st.title('Extract Fields from Product Manuals')
	link = st.text_input('Enter the link to the product document:')
	process_clicked = st.button('Process')
	if process_clicked and not link:
	    st.write('Please enter a valid link.')
	elif process_clicked:
	    final_result = main(link)
	    st.write('Final extracted fields/tags:')
	    st.json(final_result)