MUHAMMAD YOUSAF RANA committed
Commit 947f13a · 1 Parent(s): da5c620

files added

Files changed (4)
  1. Dockerfile +34 -0
  2. main.py +116 -0
  3. my_lib/preproces_video.py +27 -0
  4. requirements.txt +12 -0
Dockerfile ADDED
@@ -0,0 +1,34 @@
+ # Use lightweight Python image
+ FROM python:3.10-slim
+
+ # Install OS dependencies for PyAV and image processing
+ RUN apt-get update && apt-get install -y \
+     ffmpeg \
+     libsm6 \
+     libxext6 \
+     libgl1 \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create non-root user
+ RUN useradd -m -u 1000 user
+ USER user
+
+ # Set up environment and working directory
+ ENV PATH="/home/user/.local/bin:$PATH"
+ WORKDIR /app
+
+ # Install Python dependencies
+ COPY --chown=user requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade pip \
+     && pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY --chown=user main.py /app/
+ COPY --chown=user my_lib /app/my_lib
+
+ # Required for Spaces: expose port 7860
+ EXPOSE 7860
+
+ # Run FastAPI app with Uvicorn
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,116 @@
+ from fastapi import FastAPI, UploadFile, File
+ from fastapi.responses import JSONResponse
+ import traceback
+ import tempfile
+ import torch
+ from PIL import Image
+ import av
+ import numpy as np
+ import os
+
+ from transformers import (
+     BitsAndBytesConfig,
+     LlavaNextVideoProcessor,
+     LlavaNextVideoForConditionalGeneration,
+ )
+ from my_lib.preproces_video import read_video_pyav
+
+ app = FastAPI()
+
+ # Load model and processor
+ MODEL_ID = "llava-hf/LLaVA-NeXT-Video-7B-hf"
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ print("Loading model and processor...")
+ processor = LlavaNextVideoProcessor.from_pretrained(MODEL_ID)
+
+ # Optional: Pre-cache model on HF Spaces to avoid redownloading
+ # from huggingface_hub import snapshot_download
+ # snapshot_download(MODEL_ID)
+
+ if device.type == "cuda":
+     # Quantize to 4-bit via bitsandbytes. A quantized model must be placed
+     # with device_map; calling .to(device) on it raises an error.
+     model = LlavaNextVideoForConditionalGeneration.from_pretrained(
+         MODEL_ID,
+         torch_dtype=torch.float16,
+         low_cpu_mem_usage=True,
+         quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+         device_map="auto",
+     )
+ else:
+     model = LlavaNextVideoForConditionalGeneration.from_pretrained(
+         MODEL_ID,
+         torch_dtype=torch.float32
+     ).to(device)
+
+ print(f"Model and processor loaded on {device}.")
+
+ @app.get("/")
+ async def root():
+     return {"message": "Welcome to the Summarization API. Use /summarize to summarize media files."}
+
+ @app.get("/health")
+ async def health():
+     return {"status": "ok", "device": device.type}
+
+ @app.post("/summarize")
+ async def summarize_media(file: UploadFile = File(...)):
+     try:
+         # Keep only the extension as the suffix; the full filename could
+         # contain characters that are invalid in a temp file name.
+         suffix = os.path.splitext(file.filename or "")[1]
+         with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+             tmp.write(await file.read())
+             tmp_path = tmp.name
+
+         content_type = file.content_type or ""
+         is_video = content_type.startswith("video/")
+         is_image = content_type.startswith("image/")
+
+         if not (is_video or is_image):
+             # The finally block below removes the temp file.
+             return JSONResponse(status_code=400, content={"error": f"Unsupported file type: {content_type}"})
+
+         if is_video:
+             container = av.open(tmp_path)
+             total_frames = container.streams.video[0].frames or sum(1 for _ in container.decode(video=0))
+             container = av.open(tmp_path)  # reopen to reset position
+
+             if total_frames == 0:
+                 raise ValueError("Could not extract frames: total frame count is zero.")
+
+             # Sample up to 8 uniformly spaced frames across the video.
+             num_frames = min(8, total_frames)
+             indices = np.linspace(0, total_frames - 1, num_frames).astype(int)
+             clip = read_video_pyav(container, indices)
+
+             conversation = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": "Summarize this video and explain the key highlights."},
+                         {"type": "video"},
+                     ],
+                 },
+             ]
+             prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+             inputs = processor(text=prompt, videos=clip, return_tensors="pt").to(device)
+
+         elif is_image:
+             image = Image.open(tmp_path).convert("RGB")
+             conversation = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": "Describe the image and summarize its content."},
+                         {"type": "image"},
+                     ],
+                 },
+             ]
+             prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+             inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+
+         output_ids = model.generate(**inputs, max_new_tokens=512)
+         # Drop the prompt tokens so only the newly generated summary is decoded.
+         generated_ids = output_ids[:, inputs["input_ids"].shape[1]:]
+         response_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+         return JSONResponse(content={"summary": response_text})
+
+     except Exception as e:
+         print("Unhandled error:", e)
+         print(traceback.format_exc())
+         return JSONResponse(status_code=500, content={"error": str(e)})
+
+     finally:
+         if 'tmp_path' in locals() and os.path.exists(tmp_path):
+             os.unlink(tmp_path)
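For quick manual testing, a client call against the /summarize endpoint could look like the following sketch. It assumes the requests package, a server running locally on port 7860, and a hypothetical sample.mp4; none of these are part of this commit.

import requests

# Hypothetical client for the /summarize endpoint defined in main.py.
# Adjust the URL if the app is not running locally on port 7860.
url = "http://localhost:7860/summarize"

with open("sample.mp4", "rb") as f:  # hypothetical input file
    # The field name "file" must match the UploadFile parameter in main.py.
    files = {"file": ("sample.mp4", f, "video/mp4")}
    response = requests.post(url, files=files)

response.raise_for_status()
print(response.json()["summary"])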
my_lib/preproces_video.py ADDED
@@ -0,0 +1,27 @@
+ import av
+ import numpy as np
+
+
+ def read_video_pyav(container, indices):
+     """
+     Decode selected frames from a video using PyAV.
+
+     Args:
+         container (av.container.input.InputContainer): The video container.
+         indices (List[int]): Indices of frames to extract.
+
+     Returns:
+         np.ndarray: Frames in shape (num_frames, height, width, 3)
+     """
+     frames = []
+     container.seek(0)
+     start_index = indices[0]
+     end_index = indices[-1]
+
+     for i, frame in enumerate(container.decode(video=0)):
+         if i > end_index:
+             break
+         if i >= start_index and i in indices:
+             frames.append(frame)
+
+     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
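As a usage sketch, this mirrors how main.py calls the helper; sample.mp4 is a hypothetical local file:

import av
import numpy as np
from my_lib.preproces_video import read_video_pyav

container = av.open("sample.mp4")  # hypothetical local video
total_frames = container.streams.video[0].frames
# Pick up to 8 uniformly spaced frame indices across the video.
indices = np.linspace(0, total_frames - 1, min(8, total_frames)).astype(int)
clip = read_video_pyav(container, indices)  # (num_frames, H, W, 3) uint8 array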
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ transformers>4.48.0
+ av
+ torch
+ torchvision
+ fastapi
+ uvicorn[standard]
+ gunicorn
+ pillow
+ numpy
+ opencv-python-headless
+ bitsandbytes
+ accelerate