from typing import Any, Dict, List

from langchain_core.messages import AnyMessage, AIMessage, HumanMessage


def get_research_topic(messages: List[AnyMessage]) -> str:
    """
    Get the research topic from the messages.
    """
    # Check if the request has a history and combine the messages into a single string
    if len(messages) == 1:
        research_topic = messages[-1].content
    else:
        research_topic = ""
        for message in messages:
            if isinstance(message, HumanMessage):
                research_topic += f"User: {message.content}\n"
            elif isinstance(message, AIMessage):
                research_topic += f"Assistant: {message.content}\n"
    return research_topic


def resolve_urls(urls_to_resolve: List[Any], id: int) -> Dict[str, str]:
    """
    Create a map of the Vertex AI Search URLs (very long) to a short URL with a
    unique id for each URL. Ensures each original URL gets a consistent shortened
    form while maintaining uniqueness.
    """
    prefix = "https://vertexaisearch.cloud.google.com/id/"
    urls = [site.web.uri for site in urls_to_resolve]

    # Create a dictionary that maps each unique URL to its first occurrence index
    resolved_map = {}
    for idx, url in enumerate(urls):
        if url not in resolved_map:
            resolved_map[url] = f"{prefix}{id}-{idx}"

    return resolved_map


def insert_citation_markers(text, citations_list):
    """
    Inserts citation markers into a text string based on start and end indices.

    Args:
        text (str): The original text string.
        citations_list (list): A list of dictionaries, where each dictionary
                               contains 'start_index', 'end_index', and
                               'segments' (the markers to insert). Indices are
                               assumed to be for the original text.

    Returns:
        str: The text with citation markers inserted.
    """
    # Sort citations by end_index in descending order.
    # If end_index is the same, secondary sort by start_index descending.
    # This ensures that insertions at the end of the string don't affect
    # the indices of earlier parts of the string that still need to be processed.
    sorted_citations = sorted(
        citations_list, key=lambda c: (c["end_index"], c["start_index"]), reverse=True
    )

    modified_text = text
    for citation_info in sorted_citations:
        # These indices refer to positions in the *original* text,
        # but since we iterate from the end, they remain valid for insertion
        # relative to the parts of the string already processed.
        end_idx = citation_info["end_index"]
        marker_to_insert = ""
        for segment in citation_info["segments"]:
            marker_to_insert += f" [{segment['label']}]({segment['short_url']})"
        # Insert the citation marker at the original end_idx position
        modified_text = (
            modified_text[:end_idx] + marker_to_insert + modified_text[end_idx:]
        )

    return modified_text


def get_citations(response, resolved_urls_map):
    """
    Extracts and formats citation information from a Gemini model's response.

    This function processes the grounding metadata provided in the response to
    construct a list of citation objects. Each citation object includes the
    start and end indices of the text segment it refers to, and a list of
    segments describing the supporting web chunks.

    Args:
        response: The response object from the Gemini model, expected to have a
                  structure including `candidates[0].grounding_metadata`.
        resolved_urls_map: A dictionary mapping chunk URIs to resolved (shortened)
                           URLs, as produced by `resolve_urls`.

    Returns:
        list: A list of dictionaries, where each dictionary represents a citation
              and has the following keys:
              - "start_index" (int): The starting character index of the cited
                                     segment in the original text. Defaults to 0
                                     if not specified.
- "end_index" (int): The character index immediately after the end of the cited segment (exclusive). - "segments" (list[str]): A list of individual markdown-formatted links for each grounding chunk. - "segment_string" (str): A concatenated string of all markdown- formatted links for the citation. Returns an empty list if no valid candidates or grounding supports are found, or if essential data is missing. """ citations = [] # Ensure response and necessary nested structures are present if not response or not response.candidates: return citations candidate = response.candidates[0] if ( not hasattr(candidate, "grounding_metadata") or not candidate.grounding_metadata or not hasattr(candidate.grounding_metadata, "grounding_supports") ): return citations for support in candidate.grounding_metadata.grounding_supports: citation = {} # Ensure segment information is present if not hasattr(support, "segment") or support.segment is None: continue # Skip this support if segment info is missing start_index = ( support.segment.start_index if support.segment.start_index is not None else 0 ) # Ensure end_index is present to form a valid segment if support.segment.end_index is None: continue # Skip if end_index is missing, as it's crucial # Add 1 to end_index to make it an exclusive end for slicing/range purposes # (assuming the API provides an inclusive end_index) citation["start_index"] = start_index citation["end_index"] = support.segment.end_index citation["segments"] = [] if ( hasattr(support, "grounding_chunk_indices") and support.grounding_chunk_indices ): for ind in support.grounding_chunk_indices: try: chunk = candidate.grounding_metadata.grounding_chunks[ind] resolved_url = resolved_urls_map.get(chunk.web.uri, None) citation["segments"].append( { "label": chunk.web.title.split(".")[:-1][0], "short_url": resolved_url, "value": chunk.web.uri, } ) except (IndexError, AttributeError, NameError): # Handle cases where chunk, web, uri, or resolved_map might be problematic # For simplicity, we'll just skip adding this particular segment link # In a production system, you might want to log this. pass citations.append(citation) return citations