Spaces:

vectara
/

media-demo

Sleeping

App Files Files Community

ofermend committed on May 27, 2024

Commit

2d02ed4

verified ·

1 Parent(s): 0e3c0ad

Upload 4 files

Browse files

Files changed (4) hide show

.gitattributes +1 -0
app.py +89 -71
header-image-1.png +3 -0
query.py +66 -16

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 header-image-2.png filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 header-image-2.png filter=lfs diff=lfs merge=lfs -text
+header-image-1.png filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,10 +1,12 @@
 from omegaconf import OmegaConf
 from query import VectaraQuery
 import os
-import requests
 import streamlit as st
 from PIL import Image
 def inject_custom_css():
     st.markdown(
@@ -17,12 +19,18 @@ def inject_custom_css():
             color: #333;
         }
         body {
             padding-top: 0px;
         }
         .stApp {
-            padding-top: 10px;
         }
         .stButton>button {
             background-color: #4CAF50;
             color: white;
             padding: 10px 24px;
@@ -61,80 +69,90 @@ def inject_custom_css():
         .css-1d391kg { /* This targets the sidebar headings */
             color: #333 !important;
         }
-        .form-container {
-            display: flex;
-            justify-content: space-between;
-            align-items: center;
-        }
-        .form-container .stTextInput {
-            flex: 1;
-        }
-        .form-container .stButton {
-            margin-left: 10px;
-        }
         </style>
         """,
         unsafe_allow_html=True
     )
-def launch_bot():
-    if 'cfg' not in st.session_state:
-        cfg = OmegaConf.create({
-            'customer_id': str(os.environ['VECTARA_CUSTOMER_ID']),
-            'corpus_id': str(os.environ['VECTARA_CORPUS_ID']),
-            'api_key': str(os.environ['VECTARA_API_KEY']),
-            'streaming': False
-        })
-        st.session_state.cfg = cfg
-        st.session_state.vq = VectaraQuery(cfg.api_key, cfg.customer_id, [cfg.corpus_id],
-                                           "vectara-summary-ext-24-05-large")
-    cfg = st.session_state.cfg
-    vq = st.session_state.vq
-    st.set_page_config(page_title="Media Demo", layout="wide")
-    inject_custom_css()
-    header_image = Image.open('header-image-2.png')
-    cropped_image = header_image.crop((0, 0, header_image.width, 200))
-    st.image(cropped_image, use_column_width=True)
-    # left side content
-    with st.sidebar:
-        image = Image.open('vectara-logo.png')
-        st.markdown("## Welcome to Media Demo\n\n"
-                    "This demo uses Vectara to find the movie where a quote is from\n\n"
-                    "Covers movies from this [playlist](https://www.youtube.com/playlist?list=PLHPTxTxtC0ibVZrT2_WKWUl2SAxsKuKwx) of free movies")
-        st.markdown("---")
-        st.markdown(
-            "## How this works?\n"
-            "This app was built with [Vectara](https://vectara.com).\n"
-        )
-        st.markdown("---")
-        st.image(image, width=250)
-    st.markdown("<center> <h3>\"Find that movie\" demo</h3> </center>", unsafe_allow_html=True)
-    st.markdown('<div class="form-container">', unsafe_allow_html=True)
-    with st.form(key='my_form'):
-        question = st.text_input("Enter your movie quote:")
-        submit_button = st.form_submit_button(label='Find the Match')
-    st.markdown('</div>', unsafe_allow_html=True)
-    if submit_button and len(question) > 5:
-        movie_name, match_url, score = vq.submit_query(question)
-        if score < 0.7:
-            st.write("Sorry, I couldn't find a match for that quote. Please try another one.")
-        else:
-            video_url, start_time = match_url.split('&t=')
-            start_time = start_time[:-1]   # remove the trailing 's'
-            col1, col2, col3 = st.columns([1, 2, 1])
-            with col2:
-                st.write(f"Here's a useful video for you: {movie_name}")
-                st.video(video_url, start_time=int(float(start_time)))
 if __name__ == "__main__":
-    launch_bot()

 from omegaconf import OmegaConf
 from query import VectaraQuery
 import os
 import streamlit as st
 from PIL import Image
+import concurrent.futures
+SCORE_THRESHOLD = 0.7
 def inject_custom_css():
     st.markdown(
             color: #333;
         }
         body {
+            font-family: 'Roboto', sans-serif;
+            background-color: #f5f5f5;
+            color: #333;
             padding-top: 0px;
+            margin-top: 0px;
         }
         .stApp {
+            padding-top: 0px;
+            margin-top: 0px;
         }
         .stButton>button {
+            margin-top: 25px;
             background-color: #4CAF50;
             color: white;
             padding: 10px 24px;
         .css-1d391kg { /* This targets the sidebar headings */
             color: #333 !important;
         }
         </style>
         """,
         unsafe_allow_html=True
     )
+def fetch_summary(vq, matching_text, doc_id):
+    return vq.get_summary(matching_text, doc_id)
+def launch_app():
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        if 'cfg' not in st.session_state:
+            cfg = OmegaConf.create({
+                'customer_id': str(os.environ['VECTARA_CUSTOMER_ID']),
+                'corpus_id': str(os.environ['VECTARA_CORPUS_ID']),
+                'api_key': str(os.environ['VECTARA_API_KEY']),
+                'streaming': False
+            })
+            st.session_state.cfg = cfg
+            st.session_state.vq = VectaraQuery(cfg.api_key, cfg.customer_id, [cfg.corpus_id])
+        cfg = st.session_state.cfg
+        vq = st.session_state.vq
+        st.set_page_config(page_title="Media Demo", layout="wide")
+        inject_custom_css()
+        header_image = Image.open('header-image-2.png')
+        cropped_image = header_image.crop((0, 0, header_image.width, 150))
+        st.image(cropped_image, use_column_width=True)
+        # left side content
+        with st.sidebar:
+            image = Image.open('vectara-logo.png')
+            st.markdown("## Welcome to Media Demo\n\n"
+                        "This demo uses Vectara to find the movie where a quote is from.\n\n"
+                        "Covers movies from this [playlist](https://www.youtube.com/playlist?list=PLHPTxTxtC0ibVZrT2_WKWUl2SAxsKuKwx) of free movies.")
+            st.markdown("---")
+            st.markdown(
+                "## How this works?\n"
+                "This app was built with [Vectara](https://vectara.com).\n"
+            )
+            st.markdown("---")
+            st.image(image, width=250)
+        st.markdown("<center> <h3>\"Where did I hear that line?\"</h3> </center>", unsafe_allow_html=True)
+        _, q_col, _ = st.columns([1, 4, 1])
+        with q_col:
+            quote = st.text_input("quote", label_visibility="hidden", placeholder="Enter a quote from a movie.")
+            prev_quote = st.session_state.get('prev_quote', '')
+            if quote != prev_quote:
+                st.session_state.quote = quote
+                st.session_state.prev_quote = quote
+                st.session_state.movie_name, st.session_state.match_url, st.session_state.score, doc_id, matching_text = vq.submit_query(quote)
+                if st.session_state.score < SCORE_THRESHOLD:
+                    st.session_state.movie_name = None
+                else:
+                    future = executor.submit(fetch_summary, vq, matching_text, doc_id)
+                    st.session_state.summary_future = future
+        if 'score' in st.session_state and st.session_state.score:
+            if st.session_state.movie_name is None:
+                st.write("Sorry, I couldn't find a match for that quote. Please try another one.")
+            else:
+                video_url, start_time = st.session_state.match_url.split('&t=')
+                video_url = f"{video_url}&cc_load_policy=1"
+                start_time = start_time[:-1]  # remove the trailing 's'
+                _, video_col, summary_col = st.columns([1, 4, 3])
+                with video_col:
+                    st.video(video_url, start_time=int(float(start_time)))
+                with summary_col:
+                    # Display the summary when it's ready
+                    if 'summary_future' in st.session_state:
+                        if st.session_state.summary_future.done():
+                            st.markdown("**Summary:**")
+                            st.session_state.summary = st.session_state.summary_future.result()
+                            st.markdown(st.session_state.summary)
+            if not st.session_state.summary_future.done():
+                st.rerun()
 if __name__ == "__main__":
+    launch_app()

header-image-1.png ADDED Viewed

Git LFS Details

SHA256: b5d85c71049219b41d52abd59f3f0ee00157967ae90e3f9aa2dfca258072813d
Pointer size: 132 Bytes
Size of remote file: 1.8 MB

query.py CHANGED Viewed

@@ -1,36 +1,67 @@
 import requests
 import json
-import re
 class VectaraQuery():
-    def __init__(self, api_key: str, customer_id: str, corpus_ids: list[str], prompt_name: str = None):
         self.customer_id = customer_id
         self.corpus_ids = corpus_ids
         self.api_key = api_key
-        self.prompt_name = prompt_name if prompt_name else "vectara-experimental-summary-ext-2023-12-11-sml"
-    def get_body(self, query_str: str):
         corpora_key_list = [{
-                'customer_id': self.customer_id, 'corpus_id': corpus_id, 'lexical_interpolation_config': {'lambda': 0.005}
             } for corpus_id in self.corpus_ids
         ]
-        return {
             'query': [
                 {
                     'query': query_str,
                     'start': 0,
-                    'numResults': 10,
                     'corpusKey': corpora_key_list,
-                    'context_config': {
-                        'sentences_before': 2,
-                        'sentences_after': 2,
-                        'start_tag': "%START_SNIPPET%",
-                        'end_tag': "%END_SNIPPET%",
                     },
-                    'rerankingConfig': { 'rerankerId': 272725719 }
                 }
             ]
         }
     def get_headers(self):
         return {
@@ -44,7 +75,7 @@ class VectaraQuery():
     def submit_query(self, query_str: str):
         endpoint = "https://api.vectara.io/v1/query"
-        body = self.get_body(query_str)
         response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers())
         if response.status_code != 200:
@@ -56,11 +87,12 @@ class VectaraQuery():
         responses = res['responseSet'][0]['response'][:top_k]
         documents = res['responseSet'][0]['document']
         metadatas = []
         for x in responses:
             md = {m["name"]: m["value"] for m in x["metadata"]}
             doc_num = x["documentIndex"]
             doc_md = {f'doc_{m["name"]}': m["value"] for m in documents[doc_num]["metadata"]}
             md.update(doc_md)
             metadatas.append(md)
@@ -68,6 +100,24 @@ class VectaraQuery():
         movie_title = metadatas[0].get("doc_title", None)
         snippet_url = metadatas[0].get("url", None)
         score = responses[0]["score"]
-        return movie_title, snippet_url, score

 import requests
 import json
 class VectaraQuery():
+    def __init__(self, api_key: str, customer_id: str, corpus_ids: list[str]):
         self.customer_id = customer_id
         self.corpus_ids = corpus_ids
         self.api_key = api_key
+        self.START_TAG = "<em_start>"
+        self.END_TAG = "<em_end>"
+        self.prompt_name = "vectara-summary-ext-24-05-med"
+        self.prompt_text = '''
+        [{"role": "system", "content": "Follow these detailed step-by-step instructions, your task is to generate an accurate and coherent summary of the first search result.
+        - You will receive a single search result enclosed in triple quotes, which includes part of a script from a movie.
+        - the search result can be a part of a larger movie scence, and may be incomplete.
+        - the text is a sequence of subtitles from the movie itself.
+        - Base your summary only on the information provided in the search result, do not use any other sources.
+        - Do no include the word summary in your response, just the summary itself.
+        - Summarize the scene including who the characters are, what they do and any other important detail."},
+        {"role": "user", "content": "#foreach ($qResult in $vectaraQueryResults) Search Result $esc.java($foreach.index + 1): \'\'\'$esc.java($qResult.text())\'\'\'.#end"}
+        ]
+        '''
+    def get_body(self, query_str: str, filter: str = None, summarize: bool = True):
         corpora_key_list = [{
+                'customerId': self.customer_id, 'corpusId': corpus_id, 'lexicalInterpolationConfig': {'lambda': 0.005}
             } for corpus_id in self.corpus_ids
         ]
+        if filter:
+            for key in corpora_key_list:
+                key['filter'] = filter
+        sent_before = 15 if summarize else 1
+        sent_after = 15 if summarize else 1
+        body = {
             'query': [
                 {
                     'query': query_str,
                     'start': 0,
+                    'numResults': 50,
                     'corpusKey': corpora_key_list,
+                    'contextConfig': {
+                        'sentences_before': sent_before,
+                        'sentences_after': sent_after,
+                        'start_tag': self.START_TAG,
+                        'end_tag': self.END_TAG
                     },
                 }
             ]
         }
+        if summarize:
+            body['query'][0]['summary'] = [
+                {
+                    'responseLang': 'eng',
+                    'maxSummarizedResults': 1,
+                    'summarizerPromptName': self.prompt_name,
+                    'promptText': self.prompt_text
+                }
+            ]
+        else:
+            body['query'][0]['rerankingConfig'] = { 'rerankerId': 272725719 }  # rerank only in main query, not when summarizing
+        return body
     def get_headers(self):
         return {
     def submit_query(self, query_str: str):
         endpoint = "https://api.vectara.io/v1/query"
+        body = self.get_body(query_str, filter=None, summarize=False)
         response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers())
         if response.status_code != 200:
         responses = res['responseSet'][0]['response'][:top_k]
         documents = res['responseSet'][0]['document']
         metadatas = []
         for x in responses:
             md = {m["name"]: m["value"] for m in x["metadata"]}
             doc_num = x["documentIndex"]
+            doc_id = documents[doc_num]["id"]
+            md['doc_id'] = doc_id
             doc_md = {f'doc_{m["name"]}': m["value"] for m in documents[doc_num]["metadata"]}
             md.update(doc_md)
             metadatas.append(md)
         movie_title = metadatas[0].get("doc_title", None)
         snippet_url = metadatas[0].get("url", None)
         score = responses[0]["score"]
+        doc_id = metadatas[0]["doc_id"]
+        matching_text = responses[0]["text"].split(self.START_TAG)[1].split(self.END_TAG)[0].strip()
+        return movie_title, snippet_url, score, doc_id, matching_text
+    def get_summary(self, query_str: str, doc_id: str):
+        endpoint = "https://api.vectara.io/v1/query"
+        filter = f"doc.id == '{doc_id}'"
+        body = self.get_body(query_str, filter, summarize=True)
+        response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers())
+        if response.status_code != 200:
+            print(f"Query failed with code {response.status_code}, reason {response.reason}, text {response.text}")
+            return "Sorry, something went wrong in my brain. Please try again later."
+        res = response.json()
+        summary = res['responseSet'][0]['summary'][0]['text']
+        return summary