Spaces:
Sleeping
Sleeping
File size: 2,417 Bytes
2c2081e de6562c 2c2081e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
"""Fetch text from url."""
# pylint: disable=too-many-branches
from typing import Optional
from urllib.parse import urlparse
import html2text
import httpx
import streamlit as st
from logzero import logger
from readability import Document
# @st.cache
def url2txt(
    url: str,
    bodywidth: Optional[int] = 5000,
    remove: bool = False,
    show_url: bool = True,
    ignore_links: bool = True,
) -> str:
    """Fetch a page and return its main content as text.

    The page is downloaded with ``httpx``. A ``text/plain`` response is
    returned verbatim; anything else is passed through
    ``readability.Document`` and then converted with ``html2text``.

    Args:
        url: address to fetch; ``http://`` is prepended when the value does
            not already start with ``http://`` or ``https://``.
        bodywidth: passed to ``html2text.HTML2Text``; if set to None, fall
            back to the default bodywidth of ``html2text.HTML2Text``.
        remove: remove blank lines from the converted text if set to True.
        show_url: prepend the (normalized) url to the result if set to True.
        ignore_links: sets ``ignore_links`` on the ``html2text`` converter
            (removes ``[text](url)`` style links).

    Returns:
        For ``text/plain`` responses, the response body unchanged. Otherwise
        ``"# <title>\\n<text>"``, preceded by the url and a newline when
        ``show_url`` is True.

    Raises:
        ValueError: if the url has no scheme or no network location.
        RuntimeError: if the extracted summary is empty.
        Exception: any error raised while fetching, parsing or converting
            the page is logged and re-raised unchanged.
    """
    url = url.strip()

    # Compare against the full scheme prefixes: a bare ``startswith("http")``
    # would wrongly treat hosts such as "httpbin.org" as already having a
    # scheme and then reject them as invalid.
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url

    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:  # no scheme or netloc present
        raise ValueError(f"Invalid url: {url}")

    logger.info("url: %s", url)

    try:
        resp = httpx.get(url, timeout=30)
        resp.raise_for_status()
    except Exception as exc:
        logger.error(exc)
        raise

    # A missing Content-Type header is not an error; treat it as "not plain
    # text" and fall through to the html handling below.
    content_type = resp.headers.get("content-type", "")

    # output text if text/plain
    if "text/plain" in content_type:
        return resp.text

    # handle html and the rest
    try:
        doc = Document(resp.text)
        # Evaluate the summary once and reuse it for both the emptiness
        # check and the conversion.
        summary = doc.summary()
    except Exception as exc:
        logger.error(exc)
        raise

    if not summary.strip():
        raise RuntimeError("No content for some reason...")

    if bodywidth is not None:
        handle = html2text.HTML2Text(bodywidth=bodywidth)
    else:
        handle = html2text.HTML2Text()
    handle.ignore_links = ignore_links

    try:
        res = handle.handle(summary)
    except Exception as exc:
        logger.error(exc)
        raise

    # drop every blank (whitespace-only) line
    if remove:
        res = "\n".join(line for line in res.splitlines() if line.strip())

    if not res.strip():  # warn if empty output
        logger.warning("Output seems to be empty...")

    body = f"# {doc.title()}\n{res}"
    if show_url:
        return f"{url}\n{body}"
    return body
|