tommytracx committed
Commit 125d97a · verified · 1 Parent(s): 81e48d9

Delete handler.py

Files changed (1)
  1. handler.py +0 -106
handler.py DELETED
@@ -1,106 +0,0 @@
- import os
- from llama_cpp import Llama
-
- class EndpointHandler:
-     def __init__(self, model_dir):
-         """
-         Initialize the handler with the GGUF model directory.
-         """
-         self.model_dir = model_dir
-         self.model_path = os.path.join(model_dir, "ogai-8x7b-q4_k_m.gguf")  # Ensure GGUF model path is correct
-         self.context_window = 32768  # Supports up to 32K tokens for RAG jobs
-         self.max_new_tokens = 4086  # Limit token generation
-
-         # System Instructions for OGAI-8x7b
-         self.system_instructions = """
-         You are OGAI-8x7b, a specialized large language model trained for the oil and gas industry within the Upstrima AI ecosystem.
-         Your primary purpose is to assist engineers, technicians, analysts, and decision-makers with domain-specific knowledge,
-         calculations, and insights related to oil and gas operations. You must follow Upstrima's formatting, technical accuracy,
-         and safety prioritization in all responses.
-
-         **Core Capabilities:**
-         1. **Domain Expertise** – Upstream, Midstream, Downstream, Regulatory Compliance
-         2. **Engineering Calculations** – Production Optimization, Equipment Sizing, Troubleshooting
-         3. **Data Analysis** – Well Logs, Reservoir Data, Anomaly Detection, Risk Assessment
-         4. **Documentation** – Technical Reports, Regulatory Submissions, Training Materials
-
-         **Response Standards:**
-         - Use Markdown formatting for structured responses
-         - Show calculations with LaTeX equations
-         - Prioritize safety, compliance, and technical accuracy
-         """
-
-         # Greeting system response
-         self.greeting_response = "**Welcome to Upstrima AI!** 🚀\n\nI am OGAI-8x7b, your specialized assistant for oil & gas engineering. How can I assist you today?"
-
-         # Load GGUF model with memory optimization
-         self.load_model()
-
-     def load_model(self):
-         """
-         Load the GGUF model using llama-cpp-python.
-         Automatically offloads layers to GPU if available.
-         """
-         try:
-             self.model = Llama(
-                 model_path=self.model_path,
-                 n_ctx=self.context_window,  # Supports 32K tokens for RAG
-                 n_gpu_layers=20,  # Offload 20 layers to GPU (adjust based on available memory)
-                 verbose=False
-             )
-         except Exception as e:
-             raise RuntimeError(f"Failed to load model: {e}")
-
-     def detect_greeting(self, user_input):
-         """
-         Check if the user input is a greeting.
-         """
-         greetings = ["hi", "hello", "hey", "good morning", "good afternoon", "good evening", "greetings"]
-         return user_input.lower().strip() in greetings
-
-     def process_prompt(self, user_input):
-         """
-         Handles user input, ensuring structured responses.
-         If a greeting is detected, the system responds automatically before the LLM processes further requests.
-         """
-         if self.detect_greeting(user_input):
-             return f"{self.greeting_response}\n\n"  # Send a predefined system message
-
-         return f"{self.system_instructions}\n\nUser Query: {user_input}\n\nResponse:"
-
-     def __call__(self, payload):
-         """
-         Handles inference requests, supporting both direct text input and RAG queries.
-         """
-         if not hasattr(self, "model"):
-             return {"error": "Model is not loaded"}
-
-         # Extract input
-         if isinstance(payload, str):
-             prompt = payload
-         elif isinstance(payload, dict):
-             prompt = payload.get("inputs", "")
-             if not prompt:
-                 return {"error": "Missing 'inputs' key in the payload"}
-         else:
-             return {"error": "Payload must be a string or dictionary"}
-
-         # Detect greetings and generate a system response if applicable
-         if self.detect_greeting(prompt):
-             return {"generated_text": self.greeting_response}
-
-         # Ensure system instructions are prepended for normal queries
-         full_prompt = self.process_prompt(prompt)
-
-         # Generate response using GGUF model
-         try:
-             output = self.model(
-                 full_prompt,
-                 max_tokens=self.max_new_tokens,
-                 temperature=0.7,
-                 top_p=0.9
-             )
-         except Exception as e:
-             return {"error": f"Inference error: {e}"}
-
-         return {"generated_text": output["choices"][0]["text"].strip()}