# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| """ | |
| CrawlUtils is a class that provides utility methods for web crawling and processing. | |
| """ | |
import logging
import re
from typing import Optional

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, DefaultMarkdownGenerator, PruningContentFilter


class CrawlUtils(object):
    """
    Provides web crawling and content extraction utilities with intelligent filtering.

    Features include asynchronous crawling, content pruning, markdown generation,
    and configurable link/media filtering. Uses the crawl4ai library for core functionality.
    """

    def __init__(self):
        """Initialize the CrawlUtils instance."""
        self.logger = logging.getLogger(__name__)

        # Configure the content filter - uses a pruning algorithm to filter page content
        content_filter = PruningContentFilter(
            threshold=0.48,
            threshold_type="fixed"
        )

        # Configure the markdown generator, applying the above content filter to produce "fit_markdown"
        md_generator = DefaultMarkdownGenerator(
            content_filter=content_filter
        )

        # Configure crawler run parameters
        self.run_config = CrawlerRunConfig(
            # 20-second page timeout
            page_timeout=20000,
            # Filtering
            word_count_threshold=10,
            excluded_tags=["nav", "footer", "aside", "header", "script", "style", "iframe", "meta"],
            exclude_external_links=True,
            exclude_internal_links=True,
            exclude_social_media_links=True,
            exclude_external_images=True,
            only_text=True,
            # Markdown generation
            markdown_generator=md_generator,
            # Cache
            cache_mode=CacheMode.BYPASS
        )

    async def get_webpage_text(self, url: str) -> Optional[str]:
        """
        Asynchronously fetches webpage content from the given URL using the configured crawler,
        then applies content filtering, markdown conversion, and text cleaning (removing
        "undefined" artifacts, excess whitespace, and tabs).

        Args:
            url (str): The URL to retrieve the text from.

        Returns:
            str: The cleaned plain text retrieved from the specified URL, or None if an error occurs.
        """
        try:
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(
                    url=url,
                    config=self.run_config
                )
                webpage_text = result.markdown.fit_markdown
                self.logger.info("Webpage Text: \n{}".format(webpage_text))

                # Clean up the text
                cleaned_text = webpage_text.replace("undefined", "")
                # Collapse runs of three or more blank lines into a single blank line
                cleaned_text = re.sub(r'(\n\s*){3,}', '\n\n', cleaned_text)
                # Remove carriage returns and tabs
                cleaned_text = re.sub(r'[\r\t]', '', cleaned_text)
                # Collapse repeated spaces
                cleaned_text = re.sub(r' +', ' ', cleaned_text)
                # Trim leading/trailing whitespace on each line
                cleaned_text = re.sub(r'^\s+|\s+$', '', cleaned_text, flags=re.MULTILINE)
                return cleaned_text.strip()
        except Exception as e:
            self.logger.error("Error: {}".format(e))
            return None
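

# Minimal usage sketch (not part of the original module): shows how get_webpage_text
# can be driven from an asyncio entry point. The URL below is a placeholder, and this
# assumes crawl4ai (with its browser dependencies) is installed.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        crawl_utils = CrawlUtils()
        # Returns cleaned plain text, or None if the crawl fails.
        text = await crawl_utils.get_webpage_text("https://example.com")
        print(text)

    asyncio.run(_demo())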