| """ | |
| Textractor module | |
| """ | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from txtai.pipeline.segmentation import Segmentation | |
class Textractor(Segmentation):
    """
    Extracts text from HTML pages retrieved by URL.

    The text method treats its input as a URL, downloads it with requests and
    reduces the returned markup to plain text with BeautifulSoup.
    """

    def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False, timeout=30):
        """
        Creates a new Textractor.

        Args:
            sentences: forwarded unchanged to Segmentation.__init__
            lines: forwarded unchanged to Segmentation.__init__
            paragraphs: forwarded unchanged to Segmentation.__init__
            minlength: forwarded unchanged to Segmentation.__init__
            join: forwarded unchanged to Segmentation.__init__
            timeout: timeout passed to requests.get, in seconds (a number or a
                     (connect, read) tuple). None disables the timeout and restores
                     the previous wait-forever behavior.
        """

        super().__init__(sentences, lines, paragraphs, minlength, join)

        # Kept on the instance so callers can tune it per pipeline
        self.timeout = timeout

    def text(self, text):
        """
        Downloads the page at a URL and returns its text content.

        The HTTP status code is not checked: the body of an error response is
        parsed and returned like any other page.

        Args:
            text: url of the page to retrieve

        Returns:
            text content of the page with markup removed

        Raises:
            requests.exceptions.RequestException: if the request fails, including
                                                  when the timeout expires
        """

        # text is a url
        # Without a timeout, requests waits indefinitely on an unresponsive server
        response = requests.get(text, timeout=self.timeout)

        # html.parser is the parser bundled with the Python standard library
        soup = BeautifulSoup(response.text, features="html.parser")
        return soup.get_text()