# File size: 12,189 Bytes
# f724766
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
import streamlit as st
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import pytesseract
from PIL import Image
import pdfplumber
import docx
from io import BytesIO
import logging
from docx import Document
from fpdf import FPDF

# Load environment variables (including GROQ_API_KEY) from a local .env file
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Initialize LLM
# SECURITY: the API key must never live in source control. ChatGroq picks the
# key up from the GROQ_API_KEY environment variable when it is not passed
# explicitly, and load_dotenv() above fills that variable from .env.
# The key that used to be hard-coded here has been exposed and should be revoked.
llm = ChatGroq(temperature=0.5, model_name="llama3-8b-8192")

# OCR Configuration for Pytesseract
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust to your system's path

# Enhanced OCR with configurable language option and multi-image support
def extract_text_from_images(images, lang="eng"):
    """Run Tesseract OCR over each image and return the combined text.

    Args:
        images: Iterable of images to pass to ``pytesseract.image_to_string``.
        lang: Tesseract language code forwarded to the OCR call.

    Returns:
        The stripped OCR text of every image that was processed without
        error, joined with newlines and stripped as a whole. Images whose
        OCR raises are logged and skipped.
    """
    recognized = []
    for picture in images:
        try:
            raw_text = pytesseract.image_to_string(picture, lang=lang)
            recognized.append(raw_text.strip())
        except Exception as e:
            logging.error(f"Error in OCR: {e}")
    return "\n".join(recognized).strip()

# Function to extract text, tables, and embedded images from a PDF
def extract_pdf_data(pdf_path):
    """Extract text, tables and embedded images from a PDF using pdfplumber.

    Args:
        pdf_path: Path or binary file-like object handed to ``pdfplumber.open``.

    Returns:
        A dict with three keys:
        ``"text"``   - the text of every page, pages separated by a newline;
        ``"tables"`` - every table returned by ``page.extract_tables()``;
        ``"images"`` - PIL images decoded from the embedded image streams.
        If the PDF cannot be opened or a page fails, the error is logged and
        whatever was collected up to that point is returned.
    """
    data = {"text": "", "tables": [], "images": []}
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
                data["tables"].extend(page.extract_tables())
                for image in page.images:
                    # Each image is decoded in its own try block so that one
                    # undecodable stream (e.g. a raw bitmap PIL cannot
                    # identify) does not abort the remaining pages.
                    try:
                        raw_bytes = image["stream"].get_data()
                        data["images"].append(Image.open(BytesIO(raw_bytes)))
                    except Exception as e:
                        logging.warning(f"Skipping unreadable PDF image: {e}")
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
    # Join with newlines so the last word of one page is not fused with the
    # first word of the next.
    data["text"] = "\n".join(page_texts)
    return data

# Function to extract text from DOCX files
def extract_docx_data(docx_file):
    """Return the non-blank paragraphs of a DOCX document as one string.

    Args:
        docx_file: Path or file-like object passed to ``docx.Document``.

    Returns:
        The stripped text of every paragraph that is not empty after
        stripping, joined with newlines. Returns "" (and logs the error)
        if the document cannot be read.
    """
    try:
        document = docx.Document(docx_file)
        kept_lines = []
        for paragraph in document.paragraphs:
            stripped = paragraph.text.strip()
            if stripped:
                kept_lines.append(stripped)
        return "\n".join(kept_lines)
    except Exception as e:
        logging.error(f"Error extracting DOCX content: {e}")
        return ""

# Function to extract text from plain text files
def extract_text_file_data(text_file):
    """Read a binary file-like object and return its UTF-8 text, stripped.

    Args:
        text_file: Object whose ``read()`` returns bytes.

    Returns:
        The decoded content with surrounding whitespace removed, or ""
        (after logging the error) when reading or decoding fails.
    """
    try:
        raw_bytes = text_file.read()
        decoded = raw_bytes.decode("utf-8")
        return decoded.strip()
    except Exception as e:
        logging.error(f"Error extracting TXT content: {e}")
        return ""

# Function to process extracted content (PDF, DOCX, etc.)
def process_content(file_data, file_type, lang="eng"):
    """Extract all text from an uploaded file, OCR-ing any images it holds.

    Args:
        file_data: Path or file-like object with the uploaded content.
        file_type: One of "pdf", "docx", "txt", "png", "jpg", "jpeg". Any
            other value yields no text and no images.
        lang: Tesseract language code used for OCR.

    Returns:
        The extracted document text, a newline, then the OCR text of the
        file's images (empty when there are none).
    """
    text = ""
    images = []
    if file_type == "pdf":
        pdf_data = extract_pdf_data(file_data)
        images = pdf_data["images"]
        # process_pdf_content() OCRs whatever images it is given, always in
        # its default language. Give it a copy without images so the OCR
        # happens exactly once, below, in the requested language, instead of
        # appearing twice in the result.
        text = process_pdf_content({**pdf_data, "images": []})
    elif file_type == "docx":
        text = extract_docx_data(file_data)
    elif file_type == "txt":
        text = extract_text_file_data(file_data)
    elif file_type in ["png", "jpg", "jpeg"]:
        image = Image.open(file_data)
        images.append(image)

    ocr_text = extract_text_from_images(images, lang)
    return text + "\n" + ocr_text

# Function to process PDF content
def process_pdf_content(pdf_data):
    """Flatten extracted PDF data into a single text string.

    Args:
        pdf_data: Dict with the keys "text", "tables" and "images".

    Returns:
        The page text followed directly by the OCR text of the images, then
        a newline and every table rendered one row per line with cells
        separated by " | " (falsy cells become empty strings). The result
        is stripped of surrounding whitespace.
    """
    image_text = extract_text_from_images(pdf_data["images"])
    body = pdf_data["text"] + image_text

    rendered_tables = []
    for table in pdf_data["tables"]:
        rendered_rows = []
        for row in table:
            cells = [str(cell) if cell else "" for cell in row]
            rendered_rows.append(" | ".join(cells))
        rendered_tables.append("\n".join(rendered_rows) + "\n")
    tables_block = "".join(rendered_tables)

    return (body + "\n" + tables_block).strip()

# Function to generate questions
def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
    """Ask the LLM for exam questions derived from the syllabus content.

    Args:
        question_type: Kind of question to request (e.g. "Essay").
        subject_name: Subject shown in the prompt.
        instructor: Instructor name shown in the prompt.
        class_name: Class name shown in the prompt.
        institution: Institution name shown in the prompt.
        syllabus_context: Extracted syllabus text the questions must be based on.
        num_questions: Number of questions to request.
        difficulty_level: Dict mapping Bloom's taxonomy level names
            ("Remember", "Understand", "Apply", "Analyze", "Evaluate",
            "Create") to their share; missing levels default to 0.

    Returns:
        The model's response text, or "" if the call fails (the error is logged).
    """
    # The values are supplied through invoke() rather than interpolated into
    # the template text: syllabus content is arbitrary and may contain "{" or
    # "}", which ChatPromptTemplate would otherwise parse as template
    # variables and fail on.
    prompt_template = """
    Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
    Subject: {subject_name}
    Instructor: {instructor}
    Class: {class_name}
    Institution: {institution}
    Syllabus Content: {syllabus_context}
    Difficulty Levels:
    - Remember: {remember}
    - Understand: {understand}
    - Apply: {apply}
    - Analyze: {analyze}
    - Evaluate: {evaluate}
    - Create: {create}
    Format questions as follows:
    Q1. ________________
    Q2. ________________
    ...
    """
    prompt_values = {
        "num_questions": num_questions,
        "question_type": question_type,
        "subject_name": subject_name,
        "instructor": instructor,
        "class_name": class_name,
        "institution": institution,
        "syllabus_context": syllabus_context,
        "remember": difficulty_level.get("Remember", 0),
        "understand": difficulty_level.get("Understand", 0),
        "apply": difficulty_level.get("Apply", 0),
        "analyze": difficulty_level.get("Analyze", 0),
        "evaluate": difficulty_level.get("Evaluate", 0),
        "create": difficulty_level.get("Create", 0),
    }
    chain = (ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser())
    try:
        return chain.invoke(prompt_values)
    except Exception as e:
        logging.error(f"Error generating {question_type} questions: {e}")
        return ""

# Function to generate answers
def generate_answers(questions, syllabus_context):
    """Ask the LLM to answer the given questions from the syllabus content.

    Args:
        questions: Text of the questions to answer.
        syllabus_context: Extracted syllabus text the answers must be based on.

    Returns:
        The model's response text, or "" if the call fails (the error is logged).
    """
    # The values are supplied through invoke() rather than interpolated into
    # the template text: both the syllabus and the generated questions are
    # arbitrary text and may contain "{" or "}", which ChatPromptTemplate
    # would otherwise parse as template variables and fail on.
    prompt = """
    Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.
    Syllabus Content: {syllabus_context}
    Questions:
    {questions}
    Format answers as follows:
    Answer 1: ________________
    Answer 2: ________________
    ...
    """
    chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
    try:
        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
    except Exception as e:
        logging.error(f"Error generating answers: {e}")
        return ""

# Function to download as DOCX
def download_as_docx(content, file_name="output.docx"):
    """Build an in-memory DOCX with one paragraph per line of *content*.

    Args:
        content: Text to write; it is split on "\\n".
        file_name: Not used by this function.

    Returns:
        A ``BytesIO`` holding the saved document, rewound to the start.
    """
    stream = BytesIO()
    document = Document()
    for paragraph_text in content.split("\n"):
        document.add_paragraph(paragraph_text)
    document.save(stream)
    stream.seek(0)
    return stream

# Function to download as PDF
def download_as_pdf(content, file_name="output.pdf"):
    """Render *content* as a simple PDF and return it as an in-memory buffer.

    Args:
        content: Text to write; it is split on "\\n" and each line is
            wrapped to the page width.
        file_name: Not used by this function.

    Returns:
        A ``BytesIO`` holding the PDF bytes, positioned at the start.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in content.split("\n"):
        # The built-in Arial font only covers Latin-1. LLM output routinely
        # contains curly quotes, bullets, etc., which would otherwise raise
        # an encoding error, so unsupported characters become "?".
        printable = line.encode("latin-1", errors="replace").decode("latin-1")
        # Start every line at the left margin: depending on the FPDF
        # flavour, multi_cell() may leave the cursor at the right edge.
        pdf.set_x(pdf.l_margin)
        # multi_cell wraps long lines instead of letting them run off the page.
        pdf.multi_cell(0, 10, printable)
    rendered = pdf.output(dest="S")
    # Classic PyFPDF returns a Latin-1 str here, fpdf2 returns a bytearray.
    pdf_bytes = rendered.encode("latin-1") if isinstance(rendered, str) else bytes(rendered)
    return BytesIO(pdf_bytes)

# Streamlit app with enhanced UI and multi-image upload support
# Page header: title plus a one-line description rendered as a markdown heading.
st.title("Bloom's Taxonomy Based Exam Paper Developer")
st.markdown("""
### A powerful tool to generate exam questions and answers using AI, based on syllabus content and Bloom's Taxonomy principles.
""")

# Sidebar Clear Data Button
# Wipes everything stored in st.session_state (extracted syllabus text,
# generated questions and answers, remembered file name).
# NOTE(review): the uploaders below are still evaluated on the same run, so a
# file that is still selected is presumably processed again right away - confirm.
if st.sidebar.button("Clear All Data"):
    st.session_state.clear()
    st.success("All data has been cleared. You can now upload a new syllabus.")

# Upload Syllabus and Multiple Images
# Single syllabus document; the uploader restricts the selectable extensions.
uploaded_file = st.sidebar.file_uploader(
    "Upload Syllabus (PDF, DOCX, TXT)",
    type=["pdf", "docx", "txt"]
)

# Optional extra images; accept_multiple_files=True allows several at once.
uploaded_images = st.sidebar.file_uploader(
    "Upload Supplementary Images (PNG, JPG, JPEG)", 
    type=["png", "jpg", "jpeg"], 
    accept_multiple_files=True
)

# Sidebar Inputs for Subject Name, Instructor, Class, and Institution
# The second argument of each text_input is its initial value. These values
# are later passed to generate_questions() for inclusion in the prompt.
subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")

# Language Option for OCR
# The selected code is forwarded as the `lang` argument of the OCR helpers.
ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])

# Process uploaded file and images
if uploaded_file or uploaded_images:
    # uploaded_file is None when only images were uploaded, so its name must
    # not be dereferenced unconditionally.
    current_filename = uploaded_file.name if uploaded_file else None

    # Clear session state when new files are uploaded
    if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != current_filename:
        st.session_state.clear()
        st.success("Previous data cleared. Processing new file...")

    st.session_state.uploaded_filename = current_filename

    # The syllabus text is rebuilt from scratch on every run. Appending to the
    # value already in session state would duplicate the image OCR text on
    # each Streamlit rerun.
    syllabus_parts = []

    # Process syllabus file
    if uploaded_file:
        # Use the file extension, not the MIME subtype: for DOCX and TXT the
        # subtype is "vnd.openxmlformats-officedocument..." / "plain", which
        # never matched the supported list.
        file_type = uploaded_file.name.rsplit(".", 1)[-1].lower()
        if file_type in ["pdf", "docx", "txt"]:
            syllabus_parts.append(process_content(uploaded_file, file_type, lang=ocr_lang))
        else:
            st.error("Unsupported file type. Please upload PDF, DOCX, or TXT files.")

    # Process images
    if uploaded_images:
        image_text = extract_text_from_images([Image.open(img) for img in uploaded_images], lang=ocr_lang)
        syllabus_parts.append(image_text)

    if syllabus_parts:
        st.session_state.syllabus_text = "\n".join(syllabus_parts)

# Preview of Syllabus
if "syllabus_text" in st.session_state:
    st.markdown("### Preview of Extracted Syllabus Content")
    st.text_area("Extracted Syllabus Content", st.session_state.syllabus_text, height=300)

# Inputs for Question Generation
# Only offered once syllabus text has been extracted.
if "syllabus_text" in st.session_state:
    st.markdown("### Generate Questions")
    question_type = st.selectbox("Select Question Type", ["Multiple Choice", "Short Answer", "Essay"])
    num_questions = st.number_input("Number of Questions", min_value=1, max_value=50, value=10)
    # One slider per Bloom's taxonomy level; the values are passed to the
    # prompt as-is (they are not checked to sum to 100).
    difficulty_levels = {
        "Remember": st.slider("Remember (%)", 0, 100, 20),
        "Understand": st.slider("Understand (%)", 0, 100, 20),
        "Apply": st.slider("Apply (%)", 0, 100, 20),
        "Analyze": st.slider("Analyze (%)", 0, 100, 20),
        "Evaluate": st.slider("Evaluate (%)", 0, 100, 10),
        "Create": st.slider("Create (%)", 0, 100, 10),
    }

    if st.button("Generate Questions"):
        with st.spinner("Generating questions..."):
            questions = generate_questions(
                question_type,
                subject_name,
                instructor_name,
                class_name,
                institution_name,
                st.session_state.syllabus_text,
                num_questions,
                difficulty_levels,
            )
        # generate_questions() returns "" when the LLM call fails. Do not
        # store that or report it as success; keep any earlier questions.
        if questions:
            st.session_state.generated_questions = questions
            st.success("Questions generated successfully!")
        else:
            st.error("Question generation failed. Please check the logs and try again.")

# Display Generated Questions
if "generated_questions" in st.session_state:
    st.markdown("### Generated Questions")
    st.text_area("Questions", st.session_state.generated_questions, height=300)

    if st.button("Generate Answers"):
        with st.spinner("Generating answers..."):
            answers = generate_answers(
                st.session_state.generated_questions,
                st.session_state.syllabus_text,
            )
        # generate_answers() returns "" when the LLM call fails. Do not
        # store that or report it as success; keep any earlier answers.
        if answers:
            st.session_state.generated_answers = answers
            st.success("Answers generated successfully!")
        else:
            st.error("Answer generation failed. Please check the logs and try again.")

# Display Generated Answers
if "generated_answers" in st.session_state:
    st.markdown("### Generated Answers")
    st.text_area("Answers", st.session_state.generated_answers, height=300)

# Download Options
if "generated_questions" in st.session_state or "generated_answers" in st.session_state:
    st.markdown("### Download Options")
    download_choice = st.radio("Select Download Format", ["DOCX", "PDF", "TXT"])

    # Assemble the export text from whichever parts exist.
    content_to_download = ""
    if "generated_questions" in st.session_state:
        content_to_download += "Generated Questions:\n" + st.session_state.generated_questions + "\n\n"
    if "generated_answers" in st.session_state:
        content_to_download += "Generated Answers:\n" + st.session_state.generated_answers

    # The download button is rendered directly rather than inside an
    # `if st.button("Download"):` block. st.button() is True only for the
    # single run after the click, so a nested download button disappears on
    # the next rerun and forces a confusing two-click flow.
    try:
        if download_choice == "DOCX":
            buffer = download_as_docx(content_to_download)
            download_name = "exam_content.docx"
            download_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        elif download_choice == "PDF":
            buffer = download_as_pdf(content_to_download)
            download_name = "exam_content.pdf"
            download_mime = "application/pdf"
        else:
            buffer = BytesIO(content_to_download.encode("utf-8"))
            download_name = "exam_content.txt"
            download_mime = "text/plain"
    except Exception as e:
        # Building the file must not take the whole page down; report it.
        logging.error(f"Error preparing {download_choice} download: {e}")
        st.error(f"Could not prepare the {download_choice} file for download.")
    else:
        st.download_button(
            label=f"Download as {download_choice}",
            data=buffer,
            file_name=download_name,
            mime=download_mime,
        )