import streamlit as st
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from datasets import load_dataset

# Load model and tokenizer
model_path = "rb757/new_app"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the dataset
dataset_dict = load_dataset(
    'HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_2022-02-22.feather",
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
    trust_remote_code=True
)

# Convert to DataFrames
train_df = pd.DataFrame(dataset_dict['train'])
val_df = pd.DataFrame(dataset_dict['validation'])

# Print columns to verify availability
print("Train set columns:", train_df.columns.tolist())
print("Validation set columns:", val_df.columns.tolist())

# Check if 'patent_number' exists
if 'patent_number' not in train_df.columns:
    st.error("Column 'patent_number' not found in the training dataset.")
else:
    # Title and description
    st.title("📜 Milestone Patent Evaluation")
    st.write("Select a patent application to evaluate its patentability.")

    # Dropdown for patent numbers
    patent_numbers = train_df['patent_number'].unique()
    selected_patent = st.selectbox("Select Patent Number", patent_numbers)

    # Retrieve the relevant fields for the selected application
    patent_info = train_df[train_df['patent_number'] == selected_patent].iloc[0]
    title = patent_info['title']
    abstract = patent_info['abstract']
    claims = patent_info['claims']
    background = patent_info['background']
    summary = patent_info['summary']
    description = patent_info['description']
    cpc_label = patent_info['cpc_label']
    ipc_label = patent_info['ipc_label']
    filing_date = patent_info['filing_date']
    patent_issue_date = patent_info['patent_issue_date']
    date_published = patent_info['date_published']
    examiner_id = patent_info['examiner_id']

    # Display the information
    st.markdown("### Title")
    st.markdown(f"**{title}**")
    st.markdown("---")
    st.markdown("### Abstract")
    st.text_area("Abstract", abstract, height=150)
    st.markdown("---")
    st.markdown("### Claims")
    st.text_area("Claims", claims, height=150)
    st.markdown("---")
    st.markdown("### Background")
    st.text_area("Background", background, height=150)
    st.markdown("---")
    st.markdown("### Summary")
    st.text_area("Summary", summary, height=150)
    st.markdown("---")
    st.markdown("### Description")
    st.text_area("Description", description, height=150)
    st.markdown("---")
    st.markdown("### CPC Label")
    st.markdown(f"**{cpc_label}**")
    st.markdown("### IPC Label")
    st.markdown(f"**{ipc_label}**")
    st.markdown("### Filing Date")
    st.markdown(f"**{filing_date}**")
    st.markdown("### Patent Issue Date")
    st.markdown(f"**{patent_issue_date}**")
    st.markdown("### Date Published")
    st.markdown(f"**{date_published}**")
    st.markdown("### Examiner ID")
    st.markdown(f"**{examiner_id}**")

    # Submit button
    if st.button("Get Patentability Score"):
        # Prepare the input text
        input_text = f"{title} {abstract} {claims} {background} {summary} {description}"
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

        # Get the model prediction
        with torch.no_grad():
            logits = model(**inputs).logits
            predictions = torch.argmax(logits, dim=-1)

        # Map the predicted class index to its decision label and display it
        decision_labels = ['REJECTED', 'ACCEPTED', 'PENDING', 'CONT-REJECTED', 'CONT-ACCEPTED', 'CONT-PENDING']
        score = decision_labels[predictions.item()]
        st.success(f"Patentability Score: **{score}**")