import json
import logging
import subprocess
import time

import gradio as gr
import spaces
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

repo_id = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
filename = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"

# Download the GGUF weights into ./models, the directory the Llama
# constructor below loads from.
try:
    start_time = time.time()
    logger.info("Downloading model...")
    hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        local_dir="./models",
    )
    end_time = time.time()
    logger.info(f"Download complete. Time taken: {end_time - start_time} seconds.")
except Exception as e:
    logger.error(f"Unable to download model: {e}")
    raise

# Cache the loaded model across requests so it is only re-created when the
# selected model file changes.
llm = None
llm_model = None


@spaces.GPU(duration=120)
def respond(
    message,
    model,
    system_message,
    max_tokens,
    temperature,
):
    chat_template = MessagesFormatterType.LLAMA_3

    global llm
    global llm_model

    # (Re)load the model only if none is loaded yet or a different file was selected.
    if llm is None or llm_model != model:
        llm = Llama(
            model_path=f"models/{model}",
            flash_attn=True,
            n_gpu_layers=-1,  # offload all layers to the GPU
            n_batch=1024,
            n_ctx=8192,
        )
        llm_model = model

    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=chat_template,
        debug_output=True,
    )

    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.max_tokens = max_tokens
    settings.stream = True

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        returns_streaming_generator=True,
        print_output=False,
    )

    # Accumulate streamed tokens and yield the growing string so Gradio
    # renders the response incrementally.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs


DESCRIPTION = '''
Enter the text extracted from the PDF: