File size: 1,272 Bytes
3724ac8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# %%


import textwrap

from benedict import benedict
from langchain_core.documents import Document


def _fixed_width_wrap(text, width: int = 70, join_str: str = "\n"):
    return join_str.join(textwrap.wrap(text, width=width))


def fixed_width_wrap(text, width: int = 70, join_str: str = "\n", split_str="\n"):
    """Wrap *text* piece by piece, keeping the breaks marked by *split_str*.

    The text is split on ``split_str``; every piece is wrapped on its own
    via ``_fixed_width_wrap`` and the results are concatenated again.

    Args:
        text: The string to wrap.
        width: Maximum length of each wrapped line.
        join_str: Separator used both between the wrapped lines of a piece
            and between consecutive pieces.
        split_str: Delimiter on which ``text`` is split before wrapping.

    Returns:
        The re-assembled, wrapped string.
    """
    pieces = text.split(split_str)
    wrapped_pieces = (
        _fixed_width_wrap(piece, width=width, join_str=join_str) for piece in pieces
    )
    return join_str.join(wrapped_pieces)


def format_doc_minimal(d, fixed_width=False):
    """Render one document chunk as a short, tab-separated text block.

    Args:
        d: Either a ``Document`` (read through its ``page_content`` and
            ``metadata`` attributes) or a mapping with ``"page_content"``
            and ``"metadata"`` keys.
        fixed_width: When true, the content is passed through
            ``_fixed_width_wrap`` with its default settings before rendering.

    Returns:
        A string with title, publication date, source and chunk content.
        The source name is ``source.host`` when truthy, otherwise
        ``source.id``; the source country falls back to ``"n/a"``.
    """
    from_document = isinstance(d, Document)
    body = d.page_content if from_document else d["page_content"]
    # benedict wraps the metadata; the dotted keys used below are presumably
    # resolved as nested key paths -- confirm against the benedict docs.
    meta = benedict(d.metadata if from_document else d["metadata"])

    if fixed_width:
        # NOTE(review): this helper wraps the whole content as one paragraph,
        # so existing newlines are not kept, and only the first wrapped line
        # follows the tab from the template. Confirm that `fixed_width_wrap`
        # was not the intended call here.
        body = _fixed_width_wrap(body)

    template = """\
Title:\t{title}
Published on:\t{pubdate}
Source:\t{source_name} ({source_country})
Chunk Content:

\t{cont}
"""
    return template.format(
        title=meta.get("title"),
        pubdate=meta.get("pubdate"),
        source_name=meta.get("source.host") or meta.get("source.id"),
        source_country=meta.get("source.country", "n/a"),
        cont=body,
    )


def format_docs(docs, doc_fn=format_doc_minimal, **kwargs):
    """Format every document in *docs* and join the results with a rule line.

    Args:
        docs: Iterable of documents accepted by ``doc_fn``.
        doc_fn: Callable that turns one document into a string; it is
            called as ``doc_fn(doc, **kwargs)``.
        **kwargs: Extra keyword arguments forwarded unchanged to ``doc_fn``.

    Returns:
        The formatted documents separated by a ``---`` line.
    """
    separator = "\n---\n"
    return separator.join(doc_fn(doc, **kwargs) for doc in docs)