Modify create_fallback_pipeline and initialize_model_once to work without CUDA
app.py
CHANGED

@@ -160,21 +160,35 @@ def initialize_model_once(model_key):
 
     # Handle standard HF models
     else:
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True
-        )
         MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+        # Only use quantization if CUDA is available
+        if torch.cuda.is_available():
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_use_double_quant=True
+            )
+
+            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                quantization_config=quantization_config,
+                torch_dtype=model_info["dtype"],
+                device_map="auto",
+                low_cpu_mem_usage=True,
+                trust_remote_code=True
+            )
+        else:
+            # For CPU-only environments, load without quantization
+            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=torch.float32,  # Use float32 for CPU
+                device_map=None,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True
+            )
+
        MODEL_CACHE["is_gguf"] = False
 
    print(f"Model {model_name} loaded successfully")
@@ -258,24 +272,41 @@ def create_llm_pipeline(model_key):
 def create_fallback_pipeline():
     """Create a fallback pipeline with a very small model"""
     model_key = "Fallback Model"
-
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_CONFIG[model_key]["name"],
-        torch_dtype=MODEL_CONFIG[model_key]["dtype"],
-        device_map="auto" if torch.cuda.is_available() else None,
-        low_cpu_mem_usage=True
-    )
+    print(f"Creating minimal fallback pipeline with {MODEL_CONFIG[model_key]['name']}")
 
+    # Avoid using bitsandbytes for quantization when CUDA is not available
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG[model_key]["name"])
+
+        # Load model in 8-bit or without quantization for CPU
+        if torch.cuda.is_available():
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_CONFIG[model_key]["name"],
+                torch_dtype=MODEL_CONFIG[model_key]["dtype"],
+                device_map="auto",
+                low_cpu_mem_usage=True
+            )
+        else:
+            # For CPU-only environments, avoid quantization
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_CONFIG[model_key]["name"],
+                torch_dtype=torch.float32,  # Use float32 for CPU
+                low_cpu_mem_usage=True
+            )
+
+        pipe = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=64,  # Reduced for CPU performance
+            temperature=0.3,
+            return_full_text=False,
+        )
+
+        return HuggingFacePipeline(pipeline=pipe)
+    except Exception as e:
+        print(f"Error creating minimal fallback pipeline: {str(e)}")
+        raise
 
 def handle_model_loading_error(model_key, session_id):
     """Handle model loading errors with fallback options"""
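The pattern shared by both changes (quantize with bitsandbytes only when a CUDA device is present, otherwise load plain float32 weights on CPU) can be sketched as a standalone helper. The snippet below is illustrative only and not part of app.py: the helper name load_causal_lm and its structure are assumptions, and it requires torch, transformers, and, on GPU, bitsandbytes.

    # Illustrative sketch only: device-aware loading helper mirroring the
    # commit's approach; load_causal_lm is a hypothetical name.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    def load_causal_lm(model_name):
        """Load a causal LM with 4-bit quantization on GPU, plain float32 on CPU."""
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

        if torch.cuda.is_available():
            # GPU path: 4-bit NF4 quantization via bitsandbytes
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
            )
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=quantization_config,
                device_map="auto",
                low_cpu_mem_usage=True,
                trust_remote_code=True,
            )
        else:
            # CPU path: skip bitsandbytes entirely and keep float32 weights
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
            )
        return tokenizer, model

On CPU-only hardware, passing a bitsandbytes quantization config typically fails, so keeping the CPU branch unquantized is what lets the Space run without CUDA.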
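Since create_fallback_pipeline now returns a LangChain HuggingFacePipeline, its result can be used like any other LangChain LLM. The snippet below is a usage sketch, not part of the commit; the prompt text is made up, and depending on the installed LangChain version the call may be llm.invoke(...) or llm(...).

    # Usage sketch (hypothetical): exercise the fallback LLM directly.
    llm = create_fallback_pipeline()

    # max_new_tokens=64 and temperature=0.3 keep fallback replies short and
    # fairly deterministic, which suits CPU-only execution.
    print(llm.invoke("Briefly describe what this app does."))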