Create app.py
app.py
ADDED
@@ -0,0 +1,53 @@
+import gradio as gr
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+
+# Define model details
+MODEL_REPO = "TheBloke/vicuna-13B-v1.5-16K-GGUF"  # You can swap this for Mistral-7B or another GGUF model
+MODEL_FILE = "vicuna-13b-v1.5-16k.Q4_K_M.gguf"    # 4-bit quantized model file
+
+# Download the quantized model from Hugging Face
+model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
+
+# Load the model with llama.cpp for CPU-only inference
+llm = Llama(
+    model_path=model_path,
+    n_gpu_layers=0,  # Set to 0 for CPU-only
+    n_threads=4,     # Adjust based on CPU cores (e.g., 4 for quad-core)
+    n_batch=512,     # Batch size for inference
+    n_ctx=2048,      # Context length (adjust based on RAM; 2048 fits ~16 GB)
+    verbose=False    # Reduce logging for cleaner output
+)
+
+# Define the inference function
+def generate_text(prompt, max_tokens=256, temperature=0.8, top_p=0.95):
+    try:
+        output = llm(
+            prompt,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            repeat_penalty=1.1
+        )
+        return output["choices"][0]["text"].strip()
+    except Exception as e:
+        return f"Error: {str(e)}"
+
+# Create Gradio interface
+interface = gr.Interface(
+    fn=generate_text,
+    inputs=[
+        gr.Textbox(label="Prompt", placeholder="Enter your prompt here..."),
+        gr.Slider(label="Max Tokens", minimum=50, maximum=512, value=256, step=10),
+        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.8, step=0.1),
+        gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.95, step=0.05)
+    ],
+    outputs=gr.Textbox(label="Generated Text"),
+    title="Quantized LLM on Hugging Face Spaces",
+    description="Run a 4-bit quantized Vicuna-13B model on CPU using llama.cpp",
+    theme="default"
+)
+
+# Launch the app
+if __name__ == "__main__":
+    interface.launch(server_name="0.0.0.0", server_port=7860)
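
One caveat: generate_text passes the raw prompt straight to the model, but Vicuna v1.5 checkpoints were fine-tuned on a USER:/ASSISTANT: chat template, so unformatted prompts often yield weaker completions. A minimal wrapper sketch (the helper name build_vicuna_prompt is ours, not part of this commit):

def build_vicuna_prompt(user_message: str) -> str:
    # Vicuna v1.5 chat format: a system preamble followed by USER/ASSISTANT turns.
    system = (
        "A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions."
    )
    return f"{system} USER: {user_message} ASSISTANT:"

Inside generate_text, the call would then become llm(build_vicuna_prompt(prompt), ...) with the sampling arguments unchanged.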
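
Note also that this commit only adds app.py; for the Space to build, the repo needs a requirements.txt covering the imports. A minimal sketch, assuming the Gradio SDK image (which ships gradio and huggingface_hub preinstalled; the essential extra dependency is llama-cpp-python, which provides the llama_cpp module):

llama-cpp-python
huggingface_hub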