"""HTML parser. Contains parser for html files. """ import re from pathlib import Path from typing import Dict, Union from abc import abstractmethod from pathlib import Path from typing import Dict, List, Optional, Union class BaseParser: """Base class for all parsers.""" def __init__(self, parser_config: Optional[Dict] = None): """Init params.""" self._parser_config = parser_config def init_parser(self) -> None: """Init parser and store it.""" parser_config = self._init_parser() self._parser_config = parser_config @property def parser_config_set(self) -> bool: """Check if parser config is set.""" return self._parser_config is not None @property def parser_config(self) -> Dict: """Check if parser config is set.""" if self._parser_config is None: raise ValueError("Parser config not set.") return self._parser_config @abstractmethod def _init_parser(self) -> Dict: """Initialize the parser with the config.""" @abstractmethod def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: """Parse file.""" class HTMLParser(BaseParser): """HTML parser.""" def _init_parser(self) -> Dict: """Init parser.""" return {} def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]: """Parse file. Returns: Union[str, List[str]]: a string or a List of strings. """ try: from unstructured.partition.html import partition_html from unstructured.staging.base import convert_to_isd from unstructured.cleaners.core import clean except ImportError: raise ValueError("unstructured package is required to parse HTML files.") # Using the unstructured library to convert the html to isd format # isd sample : isd = [ # {"text": "My Title", "type": "Title"}, # {"text": "My Narrative", "type": "NarrativeText"} # ] with open(file, "r", encoding="utf-8") as fp: elements = partition_html(file=fp) isd = convert_to_isd(elements) # Removing non ascii charactwers from isd_el['text'] for isd_el in isd: isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode() # Removing all the \n characters from isd_el['text'] using regex and replace with single space # Removing all the extra spaces from isd_el['text'] using regex and replace with single space for isd_el in isd: isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL) isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL) # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation for isd_el in isd: clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True) # Creating a list of all the indexes of isd_el['type'] = 'Title' title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title'] # Creating 'Chunks' - List of lists of strings # each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title' # Each Chunk can be thought of as an individual set of data, which can be sent to the model # Where Each Title is grouped together with the data under it Chunks = [[]] final_chunks = list(list()) for i, isd_el in enumerate(isd): if i in title_indexes: Chunks.append([]) Chunks[-1].append(isd_el['text']) # Removing all the chunks with sum of lenth of all the strings in the chunk < 25 # TODO: This value can be an user defined variable for chunk in Chunks: # sum of lenth of all the strings in the chunk sum = 0 sum += len(str(chunk)) if sum < 25: Chunks.remove(chunk) else: # appending all the approved chunks to final_chunks as a single string final_chunks.append(" ".join([str(item) for item in chunk])) return final_chunks