DebopamC committed on
Commit
aaaffa8
·
verified ·
1 Parent(s): 78cbb80

Upload 16 files

Browse files
static/database_scema.txt CHANGED
@@ -55,8 +55,8 @@ Rows: 38279, Columns: 4
55
  CREATE TABLE orders (
56
  order_id VARCHAR(255) PRIMARY KEY,
57
  customer_id VARCHAR(255),
58
- order_purchase_timestamp TIMESTAMP,
59
- order_approved_at TIMESTAMP,
60
  FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
61
  );
62
  ```
 
55
  CREATE TABLE orders (
56
  order_id VARCHAR(255) PRIMARY KEY,
57
  customer_id VARCHAR(255),
58
+ order_purchase_timestamp VARCHAR(255),
59
+ order_approved_at VARCHAR(255),
60
  FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
61
  );
62
  ```
static/default_questions.txt CHANGED
@@ -132,6 +132,33 @@ These questions are generated by ChatGpt 4o. Copy and paste the questions in the
132
  ### Hard Questions
133
 
134
  1.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  **Question:**
136
  ```
137
  Write a Query to find the total revenue (sum of `price` + `shipping_charges`) generated for each product category in the `order_items` table, joined with the `products` table.
@@ -149,25 +176,5 @@ These questions are generated by ChatGpt 4o. Copy and paste the questions in the
149
  ORDER BY total_revenue DESC;
150
  ```
151
 
152
- ---
153
-
154
- 2.
155
- **Question:**
156
- ```
157
- Write a Query to identify the top 5 products with the highest total sales value ( sum of `price` ) across all orders.
158
- ```
159
- **Fine-Tuned Model Results:**
160
- ❌ **Fail**
161
- **Answer by ChatGpt 4o:**
162
- ```sql
163
- SELECT
164
- product_id,
165
- SUM(price) AS total_sales
166
- FROM order_items
167
- GROUP BY product_id
168
- ORDER BY total_sales DESC
169
- LIMIT 5;
170
- ```
171
- **Issue:** Misalignment with finer-grained filters or lack of handling for tied ranks.
172
 
173
- ---
 
132
  ### Hard Questions
133
 
134
  1.
135
+ **Question:**
136
+ ```
137
+ Select the most recent 1000 orders, their corresponding product details, customer details, and when they were purchased.
138
+ ```
139
+ **Fine-Tuned Model Results:**
140
+ ✅ **Pass**
141
+ **Answer by ChatGpt 4o:**
142
+ ```sql
143
+ SELECT
144
+ o.order_id,
145
+ o.order_purchase_timestamp,
146
+ c.customer_id,
147
+ c.customer_city,
148
+ c.customer_state,
149
+ oi.product_id,
150
+ p.product_category_name,
151
+ oi.price,
152
+ oi.shipping_charges
153
+ FROM orders o
154
+ JOIN customers c ON o.customer_id = c.customer_id
155
+ JOIN order_items oi ON o.order_id = oi.order_id
156
+ JOIN products p ON oi.product_id = p.product_id
157
+ ORDER BY o.order_purchase_timestamp DESC
158
+ LIMIT 1000;
159
+ ```
160
+
161
+ 2.
162
  **Question:**
163
  ```
164
  Write a Query to find the total revenue (sum of `price` + `shipping_charges`) generated for each product category in the `order_items` table, joined with the `products` table.
 
176
  ORDER BY total_revenue DESC;
177
  ```
178
 
179
+ ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
 
utils/__pycache__/llm_logic.cpython-312.pyc CHANGED
Binary files a/utils/__pycache__/llm_logic.cpython-312.pyc and b/utils/__pycache__/llm_logic.cpython-312.pyc differ
 
utils/llm_logic.py CHANGED
@@ -6,6 +6,8 @@ import multiprocessing
6
  from langchain_community.chat_models import ChatLlamaCpp
7
  from langchain_google_genai import ChatGoogleGenerativeAI
8
 
 
 
9
  local_model = "qwen2.5-coder-3b-instruct-q4_k_m.gguf"
10
 
11
  stop = [
@@ -31,7 +33,7 @@ stop = [
31
 
32
 
33
  def get_local_llm():
34
- cache_llm = ChatLlamaCpp(
35
  temperature=0.0,
36
  model_path=local_model,
37
  n_ctx=10000,
@@ -39,11 +41,19 @@ def get_local_llm():
39
  n_batch=1024,
40
  max_tokens=500,
41
  n_threads=multiprocessing.cpu_count() - 1,
42
- top_p=0.97,
43
  verbose=False,
44
  stop=stop,
45
  )
46
- return cache_llm
 
 
 
 
 
 
 
 
47
 
48
 
49
  local_llm = get_local_llm()
@@ -51,14 +61,16 @@ local_llm = get_local_llm()
51
 
52
  def get_gemini_llm():
53
  gemini = ChatGoogleGenerativeAI(
54
- model="gemini-1.5-flash",
55
  temperature=0,
56
  max_tokens=None,
57
  timeout=None,
58
  max_retries=2,
 
59
  )
60
  return gemini
61
 
 
62
  gemini_llm = get_gemini_llm()
63
 
64
 
@@ -115,13 +127,15 @@ Rows: 38279, Columns: 5
115
  | 27442 | hrjNaMt3Wyo5 | toys | 1850 | 37 | 22 | 40 |
116
 
117
  Rows: 38279, Columns: 6
 
118
  """
119
 
120
  # Improved SQL generation prompt
121
  sql_system_prompt = """You are a highly skilled natural language to SQL translator. Your goal is to generate accurate SQL queries based on the provided database schema. You must only return the SQL query and no other text or explanations.
122
-
123
  DATABASE SCHEMA:
124
  {db_schema}
 
 
125
  """
126
  sql_chat_template = """
127
 
@@ -165,7 +179,7 @@ QUESTION: {question}
165
  """
166
 
167
 
168
- def classify_question(question: str, llm , use_default_schema: bool = True):
169
  classification_system_prompt_local = classification_system_prompt # Initialize here
170
  if use_default_schema:
171
  classification_system_prompt_local = classification_system_prompt_local.format(
@@ -184,7 +198,7 @@ def classify_question(question: str, llm , use_default_schema: bool = True):
184
  return response.content.strip().upper()
185
 
186
 
187
- def generate_llm_response(prompt: str, llm: str, use_default_schema: bool = True):
188
 
189
  if llm == "gemini":
190
  llm = gemini_llm
 
6
  from langchain_community.chat_models import ChatLlamaCpp
7
  from langchain_google_genai import ChatGoogleGenerativeAI
8
 
9
+ from langchain_ollama import ChatOllama
10
+
11
  local_model = "qwen2.5-coder-3b-instruct-q4_k_m.gguf"
12
 
13
  stop = [
 
33
 
34
 
35
  def get_local_llm():
36
+ llm = ChatLlamaCpp(
37
  temperature=0.0,
38
  model_path=local_model,
39
  n_ctx=10000,
 
41
  n_batch=1024,
42
  max_tokens=500,
43
  n_threads=multiprocessing.cpu_count() - 1,
44
+ top_p=0.95,
45
  verbose=False,
46
  stop=stop,
47
  )
48
+ # llm = ChatOllama(
49
+ # model="qwen2.5-coder:3b",
50
+ # temperature=0.0,
51
+ # num_predict=150,
52
+ # top_p=0.95,
53
+ # stop=stop,
54
+ # )
55
+
56
+ return llm
57
 
58
 
59
  local_llm = get_local_llm()
 
61
 
62
  def get_gemini_llm():
63
  gemini = ChatGoogleGenerativeAI(
64
+ model="gemini-2.0-flash-exp",
65
  temperature=0,
66
  max_tokens=None,
67
  timeout=None,
68
  max_retries=2,
69
+ top_p=0.95,
70
  )
71
  return gemini
72
 
73
+
74
  gemini_llm = get_gemini_llm()
75
 
76
 
 
127
  | 27442 | hrjNaMt3Wyo5 | toys | 1850 | 37 | 22 | 40 |
128
 
129
  Rows: 38279, Columns: 6
130
+
131
  """
132
 
133
  # Improved SQL generation prompt
134
  sql_system_prompt = """You are a highly skilled natural language to SQL translator. Your goal is to generate accurate SQL queries based on the provided database schema. You must only return the SQL query and no other text or explanations.
 
135
  DATABASE SCHEMA:
136
  {db_schema}
137
+
138
+ The timestamp columns are of type 'VarChar'. I am using DuckDB to execute the queries.
139
  """
140
  sql_chat_template = """
141
 
 
179
  """
180
 
181
 
182
+ def classify_question(question: str, llm, use_default_schema: bool = True):
183
  classification_system_prompt_local = classification_system_prompt # Initialize here
184
  if use_default_schema:
185
  classification_system_prompt_local = classification_system_prompt_local.format(
 
198
  return response.content.strip().upper()
199
 
200
 
201
+ def generate_llm_response(prompt: str, llm: str, use_default_schema: bool = True):
202
 
203
  if llm == "gemini":
204
  llm = gemini_llm
🤖SQL_Agent.py CHANGED
@@ -66,6 +66,7 @@ st.markdown(
66
  unsafe_allow_html=True,
67
  )
68
 
 
69
  with st.popover("Click here to see Database Schema", use_container_width=True):
70
  uploaded_df_schema = st.session_state.get("uploaded_df_schema", False)
71
 
@@ -155,14 +156,14 @@ with col1:
155
  with col2:
156
  llm_option_radio = st.radio(
157
  "Choose LLM Model",
158
- ["Gemini 1.5-Flash", "FineTuned Qwen2.5-Coder-3B for SQL"],
159
  captions=[
160
  "Used via API",
161
  "Run Locally on this Server. Extremely Slow because of Free vCPUs",
162
  ],
163
  label_visibility="collapsed",
164
  )
165
- if llm_option_radio == "Gemini 1.5-Flash":
166
  llm_option = "gemini"
167
  else:
168
  llm_option = "qwen"
@@ -236,7 +237,7 @@ if st.session_state.conversation_turns < MAX_TURNS:
236
  spinner_text = ""
237
  if llm_option == "gemini":
238
  spinner_text = (
239
- "Using Gemini-1.5-Flash to run your query. Please wait...😊"
240
  )
241
  else:
242
  spinner_text = "I know it is taking a lot of time. To run the model I'm using `Free` small vCPUs provided by `HuggingFace Spaces` for deployment. Thank you so much for your patience😊"
@@ -314,3 +315,6 @@ else:
314
  st.chat_input(
315
  "Ask me a SQL query question", disabled=True
316
  ) # Disable the input field
 
 
 
 
66
  unsafe_allow_html=True,
67
  )
68
 
69
+
70
  with st.popover("Click here to see Database Schema", use_container_width=True):
71
  uploaded_df_schema = st.session_state.get("uploaded_df_schema", False)
72
 
 
156
  with col2:
157
  llm_option_radio = st.radio(
158
  "Choose LLM Model",
159
+ ["Gemini-2.0-Flash-Exp", "FineTuned Qwen2.5-Coder-3B for SQL"],
160
  captions=[
161
  "Used via API",
162
  "Run Locally on this Server. Extremely Slow because of Free vCPUs",
163
  ],
164
  label_visibility="collapsed",
165
  )
166
+ if llm_option_radio == "Gemini-2.0-Flash-Exp":
167
  llm_option = "gemini"
168
  else:
169
  llm_option = "qwen"
 
237
  spinner_text = ""
238
  if llm_option == "gemini":
239
  spinner_text = (
240
+ "Using Gemini-2.0-Flash-Exp to run your query. Please wait...😊"
241
  )
242
  else:
243
  spinner_text = "I know it is taking a lot of time. To run the model I'm using `Free` small vCPUs provided by `HuggingFace Spaces` for deployment. Thank you so much for your patience😊"
 
315
  st.chat_input(
316
  "Ask me a SQL query question", disabled=True
317
  ) # Disable the input field
318
+
319
+ with st.sidebar:
320
+ st.caption("Made with ❤️ by @Debopam_Chowdhury")