Spaces:

codelion
/

videoanalysis

Running

App Files Community

codelion committed on Apr 2

Commit

0425992

verified ·

1 Parent(s): 63595a8

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -11

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
 if not GOOGLE_API_KEY:
     raise ValueError("Please set the GOOGLE_API_KEY environment variable.")
-# Initialize the Gemini API client via AI Studio using the API key.
 client = genai.Client(api_key=GOOGLE_API_KEY)
 # Use the Gemini 2.0 Flash model.
@@ -21,8 +21,8 @@ MODEL_NAME = "gemini-2.0-flash-001"
 def call_gemini(video_file: str, prompt: str) -> str:
     """
     Call the Gemini model with the provided video file and prompt.
-    The video file is read as bytes and passed with MIME type "video/mp4".
-    The prompt is passed as a plain string.
     """
     with open(video_file, "rb") as f:
         file_bytes = f.read()
@@ -30,7 +30,7 @@ def call_gemini(video_file: str, prompt: str) -> str:
         model=MODEL_NAME,
         contents=[
             Part(file_data=file_bytes, mime_type="video/mp4"),
-            prompt
         ]
     )
     return response.text
@@ -53,14 +53,15 @@ def get_key_frames(video_file: str, summary: str, user_query: str) -> list:
     Ask Gemini to output key timestamps and descriptions in plain text.
     The prompt instructs the model to output one line per event in the format:
     HH:MM:SS - description
-    We then parse these lines and extract frames using OpenCV.
     Returns a list of tuples: (image_array, caption)
     """
     prompt = (
-        "List the key timestamps in the video and a brief description of the important event at that time. "
         "Output one line per event in the following format: HH:MM:SS - description. Do not include any extra text."
     )
     prompt += f" Video Summary: {summary}"
     if user_query:
         prompt += f" Focus on: {user_query}"
@@ -103,15 +104,16 @@ def get_key_frames(video_file: str, summary: str, user_query: str) -> list:
 def analyze_video(video_file: str, user_query: str) -> (str, list):
     """
-    Perform a single-step video analysis on the uploaded file.
-    First, call Gemini to get a brief summary of the video.
-    Then, ask Gemini for key timestamps and descriptions.
     Returns:
-      - A Markdown report as a string.
       - A gallery list of key frames (each as a tuple of (image, caption)).
     """
-    summary_prompt = "Summarize this video in a few sentences, focusing on any security or surveillance insights."
     if user_query:
         summary_prompt += f" Also focus on: {user_query}"
     try:
@@ -119,6 +121,7 @@ def analyze_video(video_file: str, user_query: str) -> (str, list):
     except Exception as e:
         summary = f"[Error in summary extraction: {e}]"
     markdown_report = f"## Video Analysis Report\n\n**Summary:**\n\n{summary}\n"
     key_frames_gallery = get_key_frames(video_file, summary, user_query)
     if not key_frames_gallery:
         markdown_report += "\n*No key frames were extracted.*\n"

 if not GOOGLE_API_KEY:
     raise ValueError("Please set the GOOGLE_API_KEY environment variable.")
+# Initialize the Gemini API client via AI Studio.
 client = genai.Client(api_key=GOOGLE_API_KEY)
 # Use the Gemini 2.0 Flash model.
 def call_gemini(video_file: str, prompt: str) -> str:
     """
     Call the Gemini model with the provided video file and prompt.
+    The video is read as bytes and passed with MIME type "video/mp4",
+    and the prompt is wrapped as a text part.
     """
     with open(video_file, "rb") as f:
         file_bytes = f.read()
         model=MODEL_NAME,
         contents=[
             Part(file_data=file_bytes, mime_type="video/mp4"),
+            Part(text=prompt)
         ]
     )
     return response.text
     Ask Gemini to output key timestamps and descriptions in plain text.
     The prompt instructs the model to output one line per event in the format:
     HH:MM:SS - description
+    We then parse these lines and extract the corresponding frames using OpenCV.
     Returns a list of tuples: (image_array, caption)
     """
     prompt = (
+        "List the key timestamps in the video and a brief description of the event at that time. "
         "Output one line per event in the following format: HH:MM:SS - description. Do not include any extra text."
     )
+    # Append the summary (and user query if provided) so the model has context.
     prompt += f" Video Summary: {summary}"
     if user_query:
         prompt += f" Focus on: {user_query}"
 def analyze_video(video_file: str, user_query: str) -> (str, list):
     """
+    Perform a single-step video analysis.
+    First, call Gemini with a simple prompt to get a brief summary.
+    Then, call Gemini to list key timestamps with descriptions.
     Returns:
+      - A Markdown report summarizing the video.
       - A gallery list of key frames (each as a tuple of (image, caption)).
     """
+    # Use a very simple prompt for summary.
+    summary_prompt = "Summarize this video."
     if user_query:
         summary_prompt += f" Also focus on: {user_query}"
     try:
     except Exception as e:
         summary = f"[Error in summary extraction: {e}]"
     markdown_report = f"## Video Analysis Report\n\n**Summary:**\n\n{summary}\n"
     key_frames_gallery = get_key_frames(video_file, summary, user_query)
     if not key_frames_gallery:
         markdown_report += "\n*No key frames were extracted.*\n"