Spaces:
Sleeping
Sleeping
import re | |
import pdfplumber | |
# Remove unneeded content (markup, URLs, punctuation) and clean up the text
def clean_text(text: str) -> str:
    """Return *text* reduced to ASCII letters, digits and single spaces.

    Processing order:

    1. HTML-style tags (``<...>``) are removed.
    2. URLs starting with ``http://`` or ``https://`` are removed.
    3. Every character that is not an ASCII letter, a digit or whitespace
       is removed.
    4. Runs of whitespace (spaces, tabs, newlines) collapse to one space and
       leading/trailing whitespace is dropped.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing remains.
    """
    # Remove HTML tags
    text = re.sub(r'<[^>]*?>', '', text)
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # Remove special characters but keep all whitespace for now: deleting
    # newlines/tabs here would fuse the words on either side of a line break
    # (e.g. "hello\nworld" -> "helloworld").
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # split() with no arguments splits on any whitespace run and ignores
    # leading/trailing whitespace, so this both trims and collapses.
    return ' '.join(text.split())
def extract_text_from_pdf(uploaded_file):
    """Extract the text of every page of a PDF, joined by newlines.

    Args:
        uploaded_file: Whatever ``pdfplumber.open`` accepts (it is passed
            through unchanged), or ``None``.

    Returns:
        The page texts joined with ``"\\n"``; an empty string when
        *uploaded_file* is ``None`` or the PDF has no pages.
    """
    if uploaded_file is None:
        return ""
    with pdfplumber.open(uploaded_file) as pdf:
        # extract_text() may yield None for a page without extractable text
        # (e.g. a scanned image); coerce to "" so the join below cannot fail.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Joining an empty list already gives "", so no special case is needed.
    return "\n".join(page_texts)