Spaces:

mikeee
/

mlbee

Sleeping

File size: 2,417 Bytes

2c2081e
de6562c
2c2081e

"""Fetch text from url."""
# pylint: disable=too-many-branches
from typing import Optional
from urllib.parse import urlparse

import html2text
import httpx
import streamlit as st
from logzero import logger
from readability import Document


# @st.cache
def url2txt(
    url: str,
    bodywidth: Optional[int] = 5000,
    remove: bool = False,
    show_url: bool = True,
    ignore_links: bool = True,
) -> str:
    """Fetch a page and return its main content as text.

    The page is downloaded with ``httpx``, its main article is extracted
    with ``readability.Document`` and converted to text with ``html2text``.
    Responses whose content type contains ``text/plain`` are returned
    verbatim (no title, no url line).

    Args:
        url: address to fetch; ``http://`` is prepended when the value does
            not already start with ``http://`` or ``https://``.
        bodywidth: passed to ``html2text.HTML2Text``; if set to None, fall
            back to the default bodywidth of ``html2text.HTML2Text``.
        remove: remove blank lines if set to True.
        show_url: prepend the url as the first output line if set to True.
        ignore_links: drop ``[text](url)`` links from the output if True.

    Returns:
        The raw body for ``text/plain`` responses; otherwise the title line
        (``# <title>``) followed by the converted main body, optionally
        preceded by the url.

    Raises:
        ValueError: if the (normalized) url has no scheme or no netloc.
        RuntimeError: if the extracted summary is empty.
        Exception: whatever ``httpx``, ``readability`` or ``html2text``
            raise is logged and re-raised unchanged.
    """
    url = url.strip()
    # Test for the full scheme prefix, case-insensitively: a plain
    # ``startswith("http")`` mistakes hosts such as ``httpbin.org`` for
    # urls that already carry a scheme and then rejects them as invalid.
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url

    # Validate before logging or doing any network I/O.
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:  # no scheme or netloc present
        raise ValueError(f"Invalid url: {url}")

    logger.info("url: %s", url)

    # NOTE(review): redirects are not followed explicitly here; depending on
    # the installed httpx version a 3xx answer may surface as an error from
    # raise_for_status() -- confirm against the pinned httpx release.
    try:
        resp = httpx.get(url, timeout=30)
        resp.raise_for_status()
    except Exception as exc:
        logger.error(exc)
        raise

    # A missing header is treated as "not plain text".
    content_type = resp.headers.get("content-type", "")

    # output text as is if text/plain
    if "text/plain" in content_type:
        return resp.text

    # handle html and the rest
    try:
        doc = Document(resp.text)
    except Exception as exc:
        logger.error(exc)
        raise

    # Compute the summary once and reuse it for the check and conversion.
    summary = doc.summary()
    if not summary.strip():
        raise RuntimeError("No content for some reason...")

    if bodywidth is not None:
        handle = html2text.HTML2Text(bodywidth=bodywidth)
    else:
        handle = html2text.HTML2Text()

    handle.ignore_links = ignore_links

    try:
        res = handle.handle(summary)
    except Exception as exc:
        logger.error(exc)
        raise

    # drop every blank (whitespace-only) line
    if remove:
        res = "\n".join(line for line in res.splitlines() if line.strip())

    if not res.strip():  # warn if empty output
        logger.warning("Output seems to be empty...")

    body = f"# {doc.title()}\n{res}"
    if show_url:
        return f"{url}\n{body}"

    return body