# new_patent_app / app.py
# rb757 - Add Streamlit app for patentability score prediction (commit b6898af)
import streamlit as st
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from datasets import load_dataset
# Load model and tokenizer
model_path = "rb757/new_app"


@st.cache_resource
def _load_model_and_tokenizer(path):
    """Return the sequence classifier and tokenizer stored under *path*.

    Streamlit re-executes this script from the top on every widget
    interaction. Caching the pair as a shared resource means the weights
    are loaded once per server process instead of once per click.
    """
    return (
        AutoModelForSequenceClassification.from_pretrained(path),
        AutoTokenizer.from_pretrained(path),
    )


model, tokenizer = _load_model_and_tokenizer(model_path)
# Load the dataset
@st.cache_resource
def _load_hupd_sample():
    """Load the HUPD 'sample' configuration once per server process.

    The train and validation splits are selected by the filing-date
    windows passed below. The result is cached because Streamlit reruns
    the whole script on each interaction, and reloading the dataset every
    time would make the UI unusably slow.
    """
    return load_dataset(
        'HUPD/hupd',
        name='sample',
        data_files="https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_2022-02-22.feather",
        train_filing_start_date='2016-01-01',
        train_filing_end_date='2016-01-21',
        val_filing_start_date='2016-01-22',
        val_filing_end_date='2016-01-31',
        # NOTE(review): this allows the dataset repository's loading script
        # to execute locally; keep it only while that repository is trusted.
        trust_remote_code=True,
    )


dataset_dict = _load_hupd_sample()
# Convert to DataFrame
@st.cache_resource
def _splits_to_frames(_splits):
    """Materialise the 'train' and 'validation' splits as DataFrames.

    The leading underscore on ``_splits`` tells Streamlit not to hash the
    argument when building the cache key. The frames are cached as a shared
    resource (no per-rerun copy), so callers must treat them as read-only.

    Returns:
        A ``(train_df, val_df)`` tuple.
    """
    return pd.DataFrame(_splits['train']), pd.DataFrame(_splits['validation'])


train_df, val_df = _splits_to_frames(dataset_dict)
# Print columns to verify availability
print("Train set columns:", train_df.columns.tolist())
print("Validation set columns:", val_df.columns.tolist())
# Check if 'patent_number' exists
if 'patent_number' not in train_df.columns:
    st.error("Column 'patent_number' not found in the training dataset.")
else:
    # Title and description
    st.title("📜 Milestone Patent Evaluation")
    st.write("Select a patent application to evaluate its patentability.")

    # Dropdown for patent numbers
    patent_numbers = train_df['patent_number'].unique()
    selected_patent = st.selectbox("Select Patent Number", patent_numbers)

    # Retrieve relevant information. Several rows can share one
    # patent_number value; the first match is the one shown.
    matches = train_df[train_df['patent_number'] == selected_patent]
    if matches.empty:
        # Nothing to display (e.g. empty training split); halt this rerun
        # instead of failing on .iloc[0].
        st.warning("No application found for the selected patent number.")
        st.stop()
    patent_info = matches.iloc[0]

    title = patent_info['title']
    abstract = patent_info['abstract']
    claims = patent_info['claims']
    background = patent_info['background']
    summary = patent_info['summary']
    description = patent_info['description']
    cpc_label = patent_info['cpc_label']
    ipc_label = patent_info['ipc_label']
    filing_date = patent_info['filing_date']
    patent_issue_date = patent_info['patent_issue_date']
    date_published = patent_info['date_published']
    examiner_id = patent_info['examiner_id']

    # Display the information
    st.markdown("### Title")
    st.markdown(f"**{title}**")

    # Long free-text sections: each gets a rule, a heading and a text box.
    long_sections = (
        ("Abstract", abstract),
        ("Claims", claims),
        ("Background", background),
        ("Summary", summary),
        ("Description", description),
    )
    for heading, body in long_sections:
        st.markdown("---")
        st.markdown(f"### {heading}")
        st.text_area(heading, body, height=150)
    st.markdown("---")

    # Short metadata fields: heading followed by the value in bold.
    metadata_fields = (
        ("CPC Label", cpc_label),
        ("IPC Label", ipc_label),
        ("Filing Date", filing_date),
        ("Patent Issue Date", patent_issue_date),
        ("Date Published", date_published),
        ("Examiner ID", examiner_id),
    )
    for heading, value in metadata_fields:
        st.markdown(f"### {heading}")
        st.markdown(f"**{value}**")

    # Submit button
    if st.button("Get Patentability Score"):
        # Prepare the input text: all textual sections joined by spaces.
        input_text = f"{title} {abstract} {claims} {background} {summary} {description}"
        # truncation=True keeps the (usually very long) text within the
        # tokenizer's length limit.
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

        # Get the model prediction; no gradients are needed for inference.
        with torch.no_grad():
            logits = model(**inputs).logits
        predictions = torch.argmax(logits, dim=-1)

        # Display the patentability score
        # NOTE(review): this order is assumed to match the class indices the
        # model was trained with - confirm against the model's config.
        decision_labels = ['REJECTED', 'ACCEPTED', 'PENDING', 'CONT-REJECTED', 'CONT-ACCEPTED', 'CONT-PENDING']
        predicted_index = predictions.item()
        if predicted_index < len(decision_labels):
            score = decision_labels[predicted_index]
            st.success(f"Patentability Score: **{score}**")
        else:
            # The model emitted a class this label list does not cover.
            st.error(f"Model returned class index {predicted_index}, which has no decision label.")