import streamlit as st
from sentence_transformers import SentenceTransformer, util
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import time

def find_abstracts(soup):
  """Extract the identifier, title, and abstract text from each CSW record."""
  id_list = []
  abs_list = []
  title_list = []

  for record in soup.find_all("csw:record"):
    identifier = record.find("dc:identifier")
    abstract = record.find("dct:abstract")
    title = record.find("dc:title")

    id_list.append(identifier.text)
    title_list.append(title.text)

    # Not every record has an abstract; keep the lists aligned with a placeholder
    if abstract is not None:
      abs_list.append(abstract.text)
    else:
      abs_list.append("NA")

  return id_list, title_list, abs_list

def get_metadata():
  """Fetch dataset metadata (identifiers, titles, abstracts) from the NCEI Geoportal."""
  URL = "https://www.ncei.noaa.gov/metadata/geoportal/opensearch?f=csw&from=0&size=5000&sort=title.sort"

  page = requests.get(URL)
  soup = BeautifulSoup(page.text, "lxml")

  id_list, title_list, abs_list = find_abstracts(soup)
  df = pd.DataFrame(list(zip(id_list, title_list, abs_list)), columns=["identifier", "title", "abstract"])
  df.to_csv("./ncei-metadata.csv", index=False)

  return df
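
# A hedged sketch (not part of the original app): paging past the first 5000
# records, assuming the Geoportal opensearch endpoint honors larger `from`
# offsets the same way the single request above does. `get_metadata_paged`,
# `page_size`, and `max_records` are hypothetical names.
def get_metadata_paged(page_size=5000, max_records=20000):
  """Fetch metadata in `page_size` chunks and concatenate the pages."""
  base = "https://www.ncei.noaa.gov/metadata/geoportal/opensearch?f=csw&sort=title.sort"
  frames = []

  for start in range(0, max_records, page_size):
    page = requests.get(f"{base}&from={start}&size={page_size}")
    soup = BeautifulSoup(page.text, "lxml")
    id_list, title_list, abs_list = find_abstracts(soup)

    # Stop as soon as a page comes back empty
    if not id_list:
      break
    frames.append(pd.DataFrame(list(zip(id_list, title_list, abs_list)),
                               columns=["identifier", "title", "abstract"]))

  df = pd.concat(frames, ignore_index=True)
  df.to_csv("./ncei-metadata.csv", index=False)
  return df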
  
def show_model(query):
  """Rank the cached dataset abstracts against `query` with a sentence-transformer."""
  path = "./ncei-metadata.csv"

  if os.path.exists(path):
    # Refresh the local metadata cache if it is more than a day old
    last_modified = os.path.getmtime(path)
    now = time.time()
    DAY = 86400

    if now - last_modified > DAY:
      df = get_metadata()
    else:
      df = pd.read_csv(path)
  else:
    df = get_metadata()
  
  # Make the abstracts the docs
  docs_df = df[df["abstract"] != "NA"]
  docs = list(docs_df["abstract"])
  titles = list(docs_df["title"])
  
  # Rank all abstracts against the search query passed in by the caller
  
  # Load the model (re-loaded on every call; see Critical Analysis below)
  model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

  # Encode the query and the documents
  query_emb = model.encode(query)
  doc_emb = model.encode(docs)

  # Compute dot scores between the query and all document embeddings
  scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

  # Combine docs, scores, and titles
  doc_score_pairs = list(zip(docs, scores, titles))

  # Sort by decreasing score
  doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
  return doc_score_pairs
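
# A hedged sketch (not used by the original app) of caching the model and the
# document embeddings across Streamlit reruns, so they are not recomputed on
# every query. `load_model` and `embed_docs` are hypothetical helpers built on
# Streamlit's st.cache_resource / st.cache_data decorators.
@st.cache_resource
def load_model():
  # Load the sentence-transformer once per session
  return SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

@st.cache_data
def embed_docs(docs):
  # Cache the document embeddings, keyed on the list of abstracts
  return load_model().encode(docs)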
  
def main():
  st.title("Semantic Search for Datasets Using Sentence Transformers")
  st.write("A case study for the National Centers for Environmental Information (NCEI)")
  st.image("noaa_logo.png", width=150)
  
  st.write("## Goal: search for datasets in NCEI's Archive using natural language queries")
  st.write("[Repo](https://github.com/myrandaGoesToSpace/semantic-search-datasets)")
  
  st.image("pres-whatisnoaa.png")
  
  st.write("## The Problem Context")
  st.write("Uses service called OneStop for data search")
  st.write("**Problems:**")
  st.write("- Uses keyword search -- not robust to natural language queries")
  st.write("- Filtering options too specific for non-expert users")
  #st.image("pres-onestop.png")
  #st.image("pres-problems.png")
  
  st.write("## The Model: [Sentence Transformers](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1)")
  st.image("pres-sentencetransformers.png")
  
  st.write("## Project Data")
  st.image("pres-metadata.png")
  
  st.write("## The Process")
  st.image("pres-creatingse.png")
  
  st.write("## Results and Demo")
  
  st.write("[Demo Notebook](https://github.com/myrandaGoesToSpace/semantic-search-datasets/blob/main/semantic_search.ipynb)")

  st.image("pres-futureplans.png")
  
  st.write("## Critical Analysis")
  st.write("- did not run with Streamlit text input")
  st.write("- only embeds the first 5000 datasets")
  st.write("- calculates embeddings for datasets with each run")

if __name__ == "__main__":
  main()