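"""Percept: a Streamlit app that rates street view images on human
perception categories (walkability, bikeability, pleasantness, greenness,
safety) by encoding each image with OpenCLIP and taking a similarity-weighted
k-NN average over precomputed, human-scored reference embeddings.

Run with (assuming this file is saved as app.py):
    streamlit run app.py
"""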
import streamlit as st
from PIL import Image
import numpy as np
import torch
import requests
from io import BytesIO
from sklearn.utils.extmath import softmax
import open_clip
#from transformers import CLIPProcessor, CLIPModel

knnpath = '20241204-ams-no-env-open_clip_ViT-H-14-378-quickgelu.npz'
clip_model_name = 'ViT-H-14-378-quickgelu'
pretrained_name = 'dfn5b'

categories = ['walkability', 'bikeability', 'pleasantness', 'greenness', 'safety']
# Set page config
st.set_page_config(
    page_title="Percept",
    layout="wide"
)

debug = False

#st.write("Available models:", open_clip.list_models())
@st.cache_resource
def load_model():
    """Load the OpenCLIP model and return the model, preprocess transform, and tokenizer"""
    model, _, preprocess = open_clip.create_model_and_transforms(
        clip_model_name, pretrained=pretrained_name
    )
    tokenizer = open_clip.get_tokenizer(clip_model_name)
    return model, preprocess, tokenizer
def process_image(image, preprocess):
    """Process an image (PIL Image or URL string) and return a batched tensor"""
    if isinstance(image, str):
        # If image is a URL, fetch it over HTTP
        response = requests.get(image)
        image = Image.open(BytesIO(response.content))

    # Ensure image is in RGB mode
    if image.mode != 'RGB':
        image = image.convert('RGB')

    processed_image = preprocess(image).unsqueeze(0)
    return processed_image
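# Note: the URL branch above is not exercised by the Streamlit flow in main(),
# which always passes an already-opened PIL image; it is kept for
# programmatic callers that supply an image URL.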
def knn_get_score(knn, k, cat, vec):
    """Score an image vector against a category's reference set via similarity-weighted k-NN"""
    allvecs = knn[f'{cat}_vecs']
    if debug: st.write('allvecs.shape', allvecs.shape)
    scores = knn[f'{cat}_scores']
    if debug: st.write('scores.shape', scores.shape)
    # Compute cosine similarity of vec against allvecs
    # (both are already normalized)
    cos_sim_table = vec @ allvecs.T
    if debug: st.write('cos_sim_table.shape', cos_sim_table.shape)
    # Get sorted array indices by similarity in descending order
    sortinds = np.flip(np.argsort(cos_sim_table, axis=1), axis=1)
    if debug: st.write('sortinds.shape', sortinds.shape)
    # Get the scores corresponding to the k most similar vectors
    kscores = scores[sortinds][:, :k]
    if debug: st.write('kscores.shape', kscores.shape)
    # Get the actual sorted similarity scores
    # (line copied from clip_retrieval_knn.py even though sortinds.shape[0] == 1 here)
    ksims = cos_sim_table[np.expand_dims(np.arange(sortinds.shape[0]), axis=1), sortinds]
    ksims = ksims[:, :k]
    if debug: st.write('ksims.shape', ksims.shape)
    # Sharpen the similarities exponentially, then softmax-normalize into weights
    ksims = softmax(10**ksims)
    # Weighted sum of the k nearest neighbors' scores
    kweightedscore = np.sum(kscores * ksims)
    return kweightedscore
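# Worked example of the weighting above (a sketch with synthetic values, k=3):
#   ksims        = [[0.30, 0.25, 0.20]]          # cosine similarities
#   10**ksims   ~= [[2.00, 1.78, 1.58]]
#   softmax(...)~= [[0.405, 0.326, 0.269]]       # weights sum to 1
# so the human scores of the nearest neighbors dominate the weighted rating.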
@st.cache_resource
def load_knn():
    return np.load(knnpath)
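# Expected layout of the .npz archive (an assumption inferred from the
# lookups in knn_get_score, not verified against the file itself):
#   f'{cat}_vecs'   : float array, shape (N, D), L2-normalized CLIP embeddings
#   f'{cat}_scores' : float array, shape (N,), human perception ratings
# for each cat in categories.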
def main():
    st.title("Percept: Human Perception of Street View Image Analyzer")

    try:
        with st.spinner('Loading CLIP model... This may take a moment.'):
            model, preprocess, tokenizer = load_model()
            device = "cuda" if torch.cuda.is_available() else "cpu"
            model = model.to(device)
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        st.info("Please make sure you have enough memory and the correct dependencies installed.")
        return  # Bail out: the rest of the app needs a loaded model

    with st.spinner('Loading KNN model... This may take a moment.'):
        knn = load_knn()
    if debug: st.write(knn['walkability_vecs'].shape)

    file = st.file_uploader('Upload An Image')

    if file:
        try:
            image = Image.open(file)
            st.image(image, caption="Uploaded Image", width=400)

            # Process image
            with st.spinner('Processing image...'):
                processed_image = process_image(image, preprocess)
                processed_image = processed_image.to(device)
                # Encode into CLIP vector
                with torch.no_grad():
                    vec = model.encode_image(processed_image)

                # Normalize vector
                vec /= vec.norm(dim=-1, keepdim=True)
                if debug: st.write(vec.shape)
                # Move back to CPU before converting to numpy
                vec = vec.cpu().numpy()
                k = 40
                for cat in categories:
                    st.write(cat, f'rating = {knn_get_score(knn, k, cat, vec):.1f}')

        except Exception as e:
            st.error(f"Error processing image: {str(e)}")

if __name__ == "__main__":
    main()