Spaces:
Sleeping
Sleeping
import os | |
import pymupdf4llm | |
from .models import parse_message | |
from .pipeline import Pipeline | |
def process_pdf(pdf_path: str) -> str:
    """Extract the contents of a PDF file as Markdown using pymupdf4llm.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        Whatever ``pymupdf4llm.to_markdown`` produces for the file.

    Raises:
        ValueError: If the conversion fails for any reason. The original
            exception is attached as ``__cause__``.
    """
    try:
        return pymupdf4llm.to_markdown(pdf_path)
    except Exception as e:
        # Normalize every failure to ValueError (the error type the other
        # helpers in this module raise) while explicitly chaining the
        # underlying exception so its traceback is not lost.
        raise ValueError(f"Error processing PDF: {e}") from e
def read_text_file(file_path: str) -> str:
    """Read the full contents of a UTF-8 text file (e.g. .txt or .md).

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text of the file.

    Raises:
        ValueError: If the file cannot be opened, read, or decoded. The
            original exception is attached as ``__cause__``.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except Exception as e:
        # Keep the broad catch so callers continue to see ValueError for every
        # failure mode, but chain the cause explicitly for debuggability.
        raise ValueError(f"Error reading text file: {e}") from e
def process_file(file_obj, output_format: str, pipeline) -> str:
    """Extract flashcards from an uploaded file, dispatching on its extension.

    Args:
        file_obj: Object whose ``name`` attribute is the path of the file.
        output_format: Format name forwarded to ``generate_flashcards``.
        pipeline: Accepted for interface compatibility; this function does
            not use it.

    Returns:
        The formatted flashcards returned by ``generate_flashcards``.

    Raises:
        ValueError: If the extension is not .pdf, .txt, or .md
            (case-insensitive), or if reading the file fails.
    """
    path = file_obj.name
    _, extension = os.path.splitext(path)
    extension = extension.lower()

    # Reject unknown types up front so the happy path reads straight through.
    if extension not in (".pdf", ".txt", ".md"):
        raise ValueError("Unsupported file type.")

    if extension == ".pdf":
        extracted = process_pdf(path)
    else:
        extracted = read_text_file(path)

    return generate_flashcards(output_format, extracted)
def reduce_newlines(text: str) -> str:
    """Collapse every run of three or more newlines down to exactly two."""
    triple = "\n" * 3
    double = "\n" * 2
    # Each pass shortens long runs; repeat until no triple newline remains.
    while triple in text:
        text = double.join(text.split(triple))
    return text
def generate_flashcards(output_format: str, content: str) -> str:
    """Generate flashcards from ``content`` and render them as ``output_format``.

    The content is first normalized with ``reduce_newlines``, then passed to
    ``extract_flashcards`` on a freshly constructed ``Pipeline``; the
    resulting response is handed to ``format_flashcards``.
    """
    normalized = reduce_newlines(content)
    extractor = Pipeline()
    raw_response = extractor.extract_flashcards(normalized)
    return format_flashcards(output_format, raw_response)
def process_text_input(input_text: str, output_format: str = "csv") -> str:
    """Extract flashcards from raw text entered by the user.

    Args:
        input_text: The text to turn into flashcards. ``None`` is treated
            the same as an empty string.
        output_format: Format name forwarded to ``generate_flashcards``;
            defaults to "csv".

    Returns:
        The formatted flashcards returned by ``generate_flashcards``.

    Raises:
        ValueError: If ``input_text`` is missing, empty, or whitespace-only.
    """
    # Guard against None as well as blank strings so callers always get the
    # same ValueError instead of an AttributeError from ``None.strip()``.
    if not input_text or not input_text.strip():
        raise ValueError("No text provided.")
    # generate_flashcards builds its own Pipeline, so none is created here.
    return generate_flashcards(output_format, input_text)
def format_flashcards(output_format: str, response: str) -> str:
    """Render a pipeline response in the requested output format.

    Args:
        output_format: "json" or "csv", matched case-insensitively.
        response: Raw response handed to ``parse_message``.

    Returns:
        ``content_to_json()`` or ``content_to_csv()`` of the parsed message,
        or an empty string for any other format name.

    Raises:
        Any exception raised by ``parse_message`` propagates unchanged.
    """
    # Parse before inspecting the format so a malformed response fails
    # regardless of which format was requested.
    parsed = parse_message(response)
    requested = output_format.lower()
    if requested == "json":
        return parsed.content_to_json()
    if requested == "csv":
        return parsed.content_to_csv()
    # NOTE(review): unrecognized formats yield "" rather than an error --
    # confirm this silent fallback is intended.
    return ""