flashcard-studio / app /processing.py
Nathan Slaughter
cleanup app
b8d2f65
import os
import pymupdf4llm
from .models import parse_message
from .pipeline import Pipeline
def process_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF file as Markdown using pymupdf4llm.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The text produced by ``pymupdf4llm.to_markdown`` for the file.

    Raises:
        ValueError: If the conversion fails for any reason. The original
            exception is attached as ``__cause__``.
    """
    try:
        return pymupdf4llm.to_markdown(pdf_path)
    except Exception as e:
        # Chain explicitly so the underlying failure stays visible in
        # tracebacks instead of being reduced to its message text.
        raise ValueError(f"Error processing PDF: {e}") from e
def read_text_file(file_path: str) -> str:
    """Read the full contents of a UTF-8 text file (e.g. ``.txt`` or ``.md``).

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded contents of the file.

    Raises:
        ValueError: If the file cannot be opened, read or decoded. The
            original exception is attached as ``__cause__``.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        # Chain explicitly so callers can still tell a missing file from a
        # decoding problem via ``__cause__``.
        raise ValueError(f"Error reading text file: {e}") from e
def process_file(file_obj, output_format: str, pipeline) -> str:
    """Extract flashcards from an uploaded file, choosing a reader by extension.

    Args:
        file_obj: Object whose ``name`` attribute is the path of the file.
        output_format: Output format name, forwarded to
            ``generate_flashcards``.
        pipeline: Accepted for interface compatibility; this function does
            not currently use it.

    Returns:
        The formatted flashcards returned by ``generate_flashcards``.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.txt`` or ``.md``
            (compared case-insensitively), or if reading the file fails.
    """
    source_path = file_obj.name
    _, suffix = os.path.splitext(source_path)
    suffix = suffix.lower()

    # Reject unknown extensions up front, before any file is touched.
    if suffix not in ('.pdf', '.txt', '.md'):
        raise ValueError("Unsupported file type.")

    reader = process_pdf if suffix == '.pdf' else read_text_file
    return generate_flashcards(output_format, reader(source_path))
def reduce_newlines(text: str) -> str:
    """Collapse every run of three or more newlines down to exactly two.

    Args:
        text: The text to normalise.

    Returns:
        The text with no occurrence of three consecutive newlines left.
    """
    # One replace pass shortens long runs only partially, so repeat until a
    # pass no longer changes anything.
    squeezed = text.replace("\n\n\n", "\n\n")
    while squeezed != text:
        text, squeezed = squeezed, squeezed.replace("\n\n\n", "\n\n")
    return squeezed
def generate_flashcards(output_format: str, content: str) -> str:
    """Extract flashcards from ``content`` and render them as ``output_format``.

    Runs of blank lines in ``content`` are collapsed with
    ``reduce_newlines``. The result is passed to ``extract_flashcards`` on
    a newly constructed ``Pipeline``, and the response is rendered by
    ``format_flashcards``.

    Args:
        output_format: Output format name, forwarded to
            ``format_flashcards``.
        content: Source text to extract flashcards from.

    Returns:
        The formatted flashcards as returned by ``format_flashcards``.
    """
    cleaned = reduce_newlines(content)
    extractor = Pipeline()
    raw_response = extractor.extract_flashcards(cleaned)
    return format_flashcards(output_format, raw_response)
def process_text_input(input_text: str, output_format: str = "csv") -> str:
    """Extract flashcards from text supplied directly by the user.

    Args:
        input_text: The text to extract flashcards from.
        output_format: Output format name, forwarded to
            ``generate_flashcards``. Defaults to ``"csv"``.

    Returns:
        The formatted flashcards returned by ``generate_flashcards``.

    Raises:
        ValueError: If ``input_text`` is ``None``, empty, or contains only
            whitespace.
    """
    # Treat a missing value like empty input so the caller gets the same
    # ValueError rather than an AttributeError from ``None.strip()``.
    if not input_text or not input_text.strip():
        raise ValueError("No text provided.")
    # No Pipeline is built here: generate_flashcards constructs its own.
    return generate_flashcards(output_format, input_text)
def format_flashcards(output_format: str, response: str) -> str:
    """Render a pipeline response in the requested output format.

    Args:
        output_format: ``"json"`` or ``"csv"``, matched case-insensitively.
        response: Raw response, parsed with ``parse_message``.

    Returns:
        The result of ``content_to_json()`` or ``content_to_csv()`` on the
        parsed message, or an empty string for any other format name.

    Raises:
        Exception: Whatever ``parse_message`` raises propagates unchanged.
    """
    # Parse before inspecting the format so a malformed response fails even
    # when the format name is unrecognised.
    message = parse_message(response)

    fmt = output_format.lower()
    if fmt == "json":
        return message.content_to_json()
    if fmt == "csv":
        return message.content_to_csv()
    # Unrecognised formats yield an empty string rather than an error.
    return ""