fartinalbania committed on
Commit d3ee75b · verified · 1 Parent(s): 935d849

Update app.py

Files changed (1)
  1. app.py +99 -8
app.py CHANGED
@@ -1,4 +1,94 @@
-#STABLE ARCHITECTURE
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+import logging
+import gradio as gr
+import uvicorn
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+app = FastAPI()
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+MODEL_ID = "tugstugi/Qwen2.5-Coder-0.5B-QwQ-draft"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(device)
+
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
+class ChatRequest(BaseModel):
+    messages: list[ChatMessage]
+
+class ChatResponse(BaseModel):
+    response: str
+    status: str = "success"
+
+def build_prompt(messages):
+    prompt = ""
+    for message in messages:
+        if message["role"] == "user":
+            prompt += f"<|im_start|>user\n{message['content']}<|im_end|>\n"
+        elif message["role"] == "assistant":
+            prompt += f"<|im_start|>assistant\n{message['content']}<|im_end|>\n"
+    prompt += "<|im_start|>assistant\n"
+    return prompt
+
+def generate_response(conversation_history, max_new_tokens=150):
+    prompt_text = build_prompt(conversation_history)
+
+    inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
+
+    generated_ids = model.generate(
+        **inputs,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=0.8,
+        top_p=0.95,
+        pad_token_id=tokenizer.eos_token_id
+    )
+
+    generated_text = tokenizer.decode(generated_ids[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
+
+    return generated_text.strip()
+
+@app.post("/api/chat", response_model=ChatResponse)
+async def chat_endpoint(request: ChatRequest):
+    try:
+        conversation = [{"role": msg.role, "content": msg.content} for msg in request.messages]
+        response_text = generate_response(conversation)
+        return ChatResponse(response=response_text)
+    except Exception as e:
+        logger.error(f"Error: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/api/health")
+async def health_check():
+    return {"status": "healthy"}
+
+# Gradio setup
+iface = gr.Interface(fn=lambda input: generate_response([{"role": "user", "content": input}]),
+                     inputs="text", outputs="text")
+app = gr.mount_gradio_app(app, iface, path="/")
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)
+
+"""#STABLE ARCHITECTURE

 import math
 import torch
@@ -331,10 +421,10 @@ class MemoryEfficientMoE(nn.Module):
 # 7) Enhanced Transformer Block with Hybrid Attention & DeepNorm
 # ------------------------------------------------------------------------
 class EnhancedHybridBlock(nn.Module):
-    """
+    "
     Transformer block with hybrid attention and DeepNorm residual scaling.
     Depending on `attn_type`, it uses either lightning attention or (placeholder) softmax attention.
-    """
+    "
     def __init__(self, config: MiniMaxConfig, layer_idx: int, attn_type: str = "lightning"):
         super().__init__()
         self.config = config
@@ -520,7 +610,7 @@ encoding = tiktoken.Encoding(
 )

 pad_token_id = special_tokens_dict["<|pad|>"]
-"""def load_model(model_dir="./"):
+def load_model(model_dir="./"):
     global model
     if model is not None:
         return model
@@ -590,7 +680,7 @@ async def chat_endpoint(request: ChatRequest):
         response_text = generate_response(request.messages)
         return ChatResponse(response=response_text)
     except Exception as e:
-        return {"error": str(e)}"""
+        return {"error": str(e)}
 # ---------------------------
 def load_model(model_dir="./"):
     global model
@@ -672,7 +762,7 @@ app.add_middleware(
     allow_headers=["*"],
 )
 def get_device():
-    """Return GPU device if available, else CPU."""
+    ""Return GPU device if available, else CPU.
     return torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Global model variable
 model_config = MiniMaxConfig(
@@ -704,7 +794,7 @@ class ChatResponse(BaseModel):
     status: str = "success"

 async def ensure_model_loaded():
-    """Ensure model is loaded before processing requests"""
+    Ensure model is loaded before processing requests"
     global model
     if model is None:
         try:
@@ -744,7 +834,7 @@ async def chat_endpoint(request: ChatRequest):

 @app.get("/api/health")
 async def health_check():
-    """Health check endpoint"""
+    "Health check endpoint""
     return {"status": "healthy"}

 import gradio as gr
@@ -803,3 +893,4 @@ app = gr.mount_gradio_app(app, iface, path="/")

 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)
+"""