import gradio as gr
from transformers import (
    pipeline,
    DonutProcessor,
    VisionEncoderDecoderModel,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
from PIL import Image
import torch
import speech_recognition as sr
from pydub import AudioSegment
import os
import re

# Donut model for OCR-free document parsing (fine-tuned on CORD receipts)
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2", force_download=True)
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
task_prompt = "<s_cord-v2>"  # task start token expected by the CORD-v2 checkpoint
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]

# Image Captioning Model
image_classifier = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Sentiment Analysis Model
sentiment_pipeline = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

# Text Categorization Model (zero-shot NLI)
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
nli_model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
nli_pipeline = pipeline("zero-shot-classification", model=nli_model, tokenizer=tokenizer)

# Function for Image Recognition (BLIP captioning)
def image_recognition(image):
    try:
        result = image_classifier(image)
        caption = result[0]["generated_text"]
        return f"<h3>Image Details:</h3><p>{caption}</p>"
    except Exception as e:
        return f"Error in Image Recognition: {str(e)}"

# Function to extract text from an image using Donut
def extract_text_from_image(input_img):
    try:
        pixel_values = processor(input_img, return_tensors="pt").pixel_values
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        outputs = model.generate(
            pixel_values.to(device),
            decoder_input_ids=decoder_input_ids.to(device),
            max_length=model.decoder.config.max_position_embeddings,
            early_stopping=True,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
            use_cache=True,
            num_beams=1,
            bad_words_ids=[[processor.tokenizer.unk_token_id]],
            return_dict_in_generate=True,
            output_scores=True,
        )
        sequence = processor.batch_decode(outputs.sequences)[0]
        sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
        sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
        return processor.token2json(sequence)
    except Exception as e:
        return {"error": str(e)}

# Function for Sentiment Analysis
def analyze_sentiment(feedback_text):
    try:
        sentiment_result = sentiment_pipeline(feedback_text)
        label = sentiment_result[0]["label"]
        score = sentiment_result[0]["score"]
        return f"<h3>Feedback Sentiment Analysis:</h3><p>{label} (confidence: {score:.2f})</p>"
    except Exception as e:
        return f"Error in Sentiment Analysis: {str(e)}"

# Function for Text Categorization (zero-shot)
def categorize_complaint(complaint_text):
    try:
        labels = ["coach cleanliness", "damage", "staff behavior", "safety", "delay", "other"]
        result = nli_pipeline(complaint_text, candidate_labels=labels)
        top_label = result["labels"][0]
        top_score = result["scores"][0]
        output = (
            f"<h3>Complaint Categorization:</h3>"
            f"<p>Text: {result['sequence']}</p>"
            f"<p>Category: {top_label} (confidence: {top_score:.2f})</p>"
        )
        return output
    except Exception as e:
        return f"Error in Complaint Categorization: {str(e)}"

# Function to Process Voice Input
def process_audio(audio):
    recognizer = sr.Recognizer()
    # Gradio passes a file path; convert the upload to WAV for SpeechRecognition
    try:
        sound = AudioSegment.from_file(audio)
        sound.export("temp.wav", format="wav")
    except Exception as e:
        return f"Audio processing error: {e}"
    try:
        with sr.AudioFile("temp.wav") as source:
            audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)
        return f"<h3>Transcribed Audio:</h3><p>{text}</p>"
    except sr.UnknownValueError:
        return "Could not understand the audio."
    except sr.RequestError as e:
        return f"Could not request results: {e}"
    finally:
        os.remove("temp.wav")  # clean up the temporary file on every path
# Gradio Interface Components
def main(image, complaint_text, feedback_text, audio):
    # Process Image
    image_results = image_recognition(image) if image else "No image provided."
    # Process OCR Text
    ocr_text = extract_text_from_image(image) if image else "No image provided."
    # Process Complaint Categorization
    categorized_complaint = categorize_complaint(complaint_text) if complaint_text else "No complaint text provided."
    # Process Sentiment Analysis
    sentiment_result = analyze_sentiment(feedback_text) if feedback_text else "No feedback text provided."
    # Process Audio Input
    audio_text = process_audio(audio) if audio else "No audio provided."
    # Join with <br> so the sections render on separate lines in the HTML output
    return f"{image_results}<br>{ocr_text}<br>{categorized_complaint}<br>{sentiment_result}<br>{audio_text}"

# Build Gradio UI
iface = gr.Interface(
    fn=main,
    inputs=[
        gr.Image(type="pil", label="Upload Complaint Image"),
        gr.Textbox(lines=5, placeholder="Enter Complaint Text", label="Complaint Text"),
        gr.Textbox(lines=2, placeholder="Enter Feedback Text", label="Feedback Text"),
        gr.Audio(type="filepath", label="Upload Audio Complaint"),  # 'filepath' hands the function a path on disk
    ],
    outputs=[
        gr.HTML(label="Results")  # HTML output so the <h3>/<p> markup renders
    ],
    title="Rail Madad Complaint Resolution System",
    description="AI-powered system for automated categorization, prioritization, and response to complaints on Rail Madad.",
)

iface.launch()

# Quick caption sanity check from the original notebook, kept commented out:
# it only runs after the Gradio server stops, and the Colab path must exist.
# img = Image.open("/content/Tech Mahindra hiring process.png")
# image_classifier(img)[0]["generated_text"]