# Spaces: Prathmesh48 / Extract-Fields (Sleeping)
# File size: 3,680 Bytes
# 1ff214d

import concurrent.futures
import os
import random
from urllib.parse import urlparse

import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_google_genai import ChatGoogleGenerativeAI

def _api_key(var_name, fallback=None):
    """Return the Google API key stored in the environment variable ``var_name``.

    Args:
        var_name: Name of the environment variable to read.
        fallback: Value to use when the variable is unset or empty.

    Returns:
        The key read from the environment, or ``fallback``.

    Raises:
        RuntimeError: If neither the variable nor ``fallback`` provides a key.
    """
    key = os.environ.get(var_name) or fallback
    if not key:
        raise RuntimeError(
            f"Set the {var_name} environment variable to a Google Generative AI API key."
        )
    return key


# Credentials are read from the environment instead of being committed to the
# repository. NOTE(review): the keys that used to be hard-coded here were
# published with the source and should be revoked and replaced.
# The second key is optional; without it both clients share the primary key.
_PRIMARY_API_KEY = _api_key("GOOGLE_API_KEY")
_SECONDARY_API_KEY = _api_key("GOOGLE_API_KEY_2", fallback=_PRIMARY_API_KEY)

gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_PRIMARY_API_KEY, temperature=0.1)
gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_SECONDARY_API_KEY, temperature=0.1)

def pdf_extractor(link):
    """Load the PDF at ``link`` and return its text as a single-item list.

    Args:
        link: Location of the PDF document, handed to ``PyPDFLoader``.

    Returns:
        A one-element list whose item is the ``page_content`` of every loaded
        document concatenated in order, with no separator in between.
    """
    # ``load()`` is used instead of ``load_and_split()``: the latter re-splits
    # the documents with a default text splitter that overlaps adjacent pieces,
    # so concatenating its output would repeat the overlapping text.
    pages = PyPDFLoader(link).load()
    return [''.join(page.page_content for page in pages)]

def web_extractor(link):
    """Load the web page at ``link`` and return its text as a single-item list.

    Args:
        link: URL of the page, handed to ``WebBaseLoader``.

    Returns:
        A one-element list whose item is the ``page_content`` of every loaded
        document concatenated in order, with no separator in between.
    """
    # ``load()`` is used instead of ``load_and_split()``: the latter re-splits
    # the documents with a default text splitter that overlaps adjacent pieces,
    # so concatenating its output would repeat the overlapping text.
    pages = WebBaseLoader(link).load()
    return [''.join(page.page_content for page in pages)]

def feature_extraction(tag, history, context):
    """Ask a Gemini client to merge new context into one field's existing text.

    Args:
        tag: Name of the field being updated; interpolated into the prompt.
        history: Text gathered so far for that field.
        context: New source material; converted with ``str`` before being
            interpolated into the prompt.

    Returns:
        The ``content`` attribute of the client's reply.
    """
    prompt = f'''
    You are an intelligent assistant tasked with updating product information. You have two data sources:
    1. Tag_History: Previously gathered information about the product.
    2. Tag_Context: New data that might contain additional details.

    Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.

    Guidelines:
    - Only add new details that are relevant to the {tag} FIELD.
    - Do not add or modify any other fields in the Tag_History.
    - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.

    Here is the data:

    Tag_Context: {str(context)}
    Tag_History: {history}

    Respond with the updated Tag_History.
    '''
    # Each call goes to one of the two module-level clients, picked at random.
    reply = random.choice((gemini, gemini1)).invoke(prompt)
    return reply.content

def _is_web_page(link):
    """Return True when ``link`` should be read as a web page rather than a PDF.

    A path ending in ``.pdf`` is always a PDF. Otherwise a path ending in
    ``.md`` or a host name starting with ``en.`` selects the web loader;
    every other link falls back to the PDF loader.
    """
    parsed = urlparse(link)
    path = parsed.path.lower()
    if path.endswith('.pdf'):
        return False
    # Inspecting the parsed path/host (instead of fixed string offsets) keeps
    # the check working for http:// links and for URLs with a query or fragment.
    return path.endswith('.md') or parsed.netloc.lower().startswith('en.')


def main(link):
    """Extract the product-manual fields from the document at ``link``.

    The document text is loaded, split into chunks, and every chunk is passed
    to ``feature_extraction`` once per field. Chunks are handled one after
    another so each field's text from earlier chunks is fed into the next
    call; the fields of a single chunk are processed concurrently.

    Args:
        link: Location of the product document (web page or PDF).

    Returns:
        A dict mapping each field name to the text accumulated for it. A field
        whose call failed for a chunk keeps its previous text.
    """
    history = {
        "Introduction": "",
        "Specifications": "",
        "Product Overview": "",
        "Safety Information": "",
        "Installation Instructions": "",
        "Setup and Configuration": "",
        "Operation Instructions": "",
        "Maintenance and Care": "",
        "Troubleshooting": "",
        "Warranty Information": "",
        "Legal Information": ""
    }

    # Extract Text
    link = link.strip()
    text = web_extractor(link) if _is_web_page(link) else pdf_extractor(link)

    # Create Chunks
    # Separators go from coarsest to finest; the empty string must come last,
    # otherwise the text is cut character by character and the coarser
    # separators are never tried.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""],
    )
    chunks = text_splitter.create_documents(text)

    # One pool is reused for all chunks; draining as_completed() before moving
    # on guarantees every field is updated before the next chunk is submitted.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for chunk in chunks:
            future_to_key = {
                executor.submit(feature_extraction, key, value, chunk.page_content): key
                for key, value in history.items()
            }
            for future in concurrent.futures.as_completed(future_to_key):
                key = future_to_key[future]
                try:
                    response = future.result()
                except Exception as e:
                    # Best effort: report the failure and keep the previous text.
                    st.write(f"Error processing {key}: {e}")
                else:
                    history[key] = response
                    st.write(f"Intermediate result for {key}: {response}")

    return history

# Streamlit page: ask for a document link and, once "Process" is pressed,
# run the extraction and show the resulting fields as JSON.
st.title('Extract Fields from Product Manuals')
link = st.text_input('Enter the link to the product document:')
process_clicked = st.button('Process')
if process_clicked and not link:
    # The button was pressed with an empty input box.
    st.write('Please enter a valid link.')
elif process_clicked:
    final_result = main(link)
    st.write('Final extracted fields/tags:')
    st.json(final_result)