Spaces:
Runtime error
Runtime error
| import torch | |
| import streamlit as st | |
| from PIL import Image | |
| from io import BytesIO | |
| from transformers import VisionEncoderDecoderModel, VisionEncoderDecoderConfig , DonutProcessor | |
def run_prediction(sample):
    """Run Donut inference on one sample and return ``(prediction, target)``.

    The function reads the module-level names ``processor``,
    ``pretrained_model``, ``task_prompt`` and ``device``; all four must be
    assigned before it is called.

    Args:
        sample: Either a dict holding a ``"pixel_values"`` entry (and a
            ``"target_sequence"`` entry used as the reference), or an image
            that ``processor`` can turn into pixel values.

    Returns:
        A tuple ``(prediction, target)``. ``prediction`` is the result of
        ``processor.token2json`` on the decoded sequence. ``target`` is the
        ``token2json`` form of ``sample["target_sequence"]`` for dict
        samples, otherwise the placeholder string ``"<not_provided>"``.
    """
    is_dataset_sample = isinstance(sample, dict)

    # Prepare encoder inputs.
    if is_dataset_sample:
        # A batch dimension is added in front of the stored pixel values.
        pixel_values = torch.tensor(sample["pixel_values"]).unsqueeze(0)
    else:
        # Preprocess the argument itself; the previous code read the
        # module-level `image` here and ignored what the caller passed in.
        pixel_values = processor(sample, return_tensors="pt").pixel_values

    # Prepare decoder inputs from the task prompt.
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    # Run inference.
    outputs = pretrained_model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=pretrained_model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # Decode the single generated sequence.
    prediction = processor.batch_decode(outputs.sequences)[0]

    # Post-processing: EOS/PAD tokens are only stripped for "cord" prompts.
    if "cord" in task_prompt:
        prediction = prediction.replace(processor.tokenizer.eos_token, "")
        prediction = prediction.replace(processor.tokenizer.pad_token, "")
    prediction = processor.token2json(prediction)

    # Load the reference target when the sample carries one.
    if is_dataset_sample:
        target = processor.token2json(sample["target_sequence"])
    else:
        target = "<not_provided>"
    return prediction, target
# Default decoder prompt; reassigned below once a predictor is loaded.
task_prompt = "<s>"

# Predictor labels are defined once so the radio options and the branch
# conditions below can never drift apart.
BASE_MODE = 'Base Common-Crawl π©'
HIERARCHICAL_MODE = 'Hierarchical Common-Crawl π©'

st.markdown('''
### Donut Common Crawl
Experimental OCR-free Document Understanding Vision Transformer nicknamed π©, fine-tuned with few samples of the common-crawl with some specific document elements.
''')

with st.sidebar:
    information = st.radio(
        "Choose one predictor:?",
        (BASE_MODE, HIERARCHICAL_MODE))
    image_choice = st.selectbox('Pick one π', ['1', '2', '3'], index=1)

st.text(f'{information} mode is ON!\nTarget π: {image_choice}')

col1, col2 = st.columns(2)

# Maps the sidebar choice to a bundled sample file under ./samples.
image_choice_map = {
    '1': 'commoncrawl_amandalacombznewspolice-bust-man-sawed-oal_1.jpg',
    '2': 'commoncrawl_canyonhillschroniclecomtagwomens-basketbll_0.png',
    '3': 'commoncrawl_celstuttgartdeideaa-different-stort-of-nfe_0.png',
}
image = Image.open(f'samples/{image_choice_map[image_choice]}')

with col1:
    st.image(image, caption='Your target sample')

if st.button('Parse sample! π'):
    # Normalise the sample through an RGB JPEG round trip. This is done in
    # memory rather than through a fixed ./target_image.jpg path, which
    # concurrent sessions would overwrite for each other.
    image = image.convert('RGB')
    jpeg_buffer = BytesIO()
    image.save(jpeg_buffer, format='JPEG')
    jpeg_buffer.seek(0)
    image = Image.open(jpeg_buffer)

    with st.spinner('baking the π©s...'):
        model_ready = False
        if information == BASE_MODE:
            processor = DonutProcessor.from_pretrained("laverdes/donut-web")
            pretrained_model = VisionEncoderDecoderModel.from_pretrained("laverdes/donut-web")
            task_prompt = "<s>"
            device = "cuda" if torch.cuda.is_available() else "cpu"
            pretrained_model.to(device)
            model_ready = True
        elif information == HIERARCHICAL_MODE:
            st.info("Not implemented yet...")

        # Only run inference when a model was actually loaded; otherwise
        # run_prediction would fail with NameError on `processor`,
        # `pretrained_model` and `device`.
        if model_ready:
            with col2:
                st.info('parsing π...')
                parsed_info, _ = run_prediction(image)
                st.text(f'\n{information}')
                st.json(parsed_info)