Spaces:

IrisDeng
/

image-to-text_test-to-speech

Running

App Files Files Community

IrisDeng committed 16 days ago

Commit

2cd1bd6

verified ·

1 Parent(s): cfffd7a

Create app.py

Browse files

Files changed (1) hide show

app.py +46 -0

app.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from typing import Dict, List, Any
+from transformers import Blip2Processor, Blip2ForConditionalGeneration
+from PIL import Image
+from io import BytesIO
+import torch, re, base64
+class EndpointHandler:
+    def __init__(self, path=""):
+        # load the optimized model
+        self.processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+        self.model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", device_map="auto")
+    def __call__(self, data: Any) -> Dict[str, Any]:
+        """
+        Args:
+            data (:obj:):
+                includes the input data and the parameters for the inference.
+        Return:
+            A :obj:`dict`:. The object returned should be a dict of one list like {"captions": ["A hugging face at the office"]} containing :
+                - "caption": A string corresponding to the generated caption.
+        """
+        # parameters = data.pop("parameters", {})
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        inputs = base64.b64decode(re.sub('^data:image/.+;base64,', '', data['inputs']))
+        raw_images = Image.open(BytesIO(inputs))
+        processed_image = self.processor(images=raw_images, return_tensors="pt").to(device)
+        out = self.model.generate(**processed_image)
+        captions = self.processor.decode(out[0], skip_special_tokens=True)
+        # postprocess the prediction
+        return {"captions": captions}
+EndpointHandler()