crisesStorylinesRAG / client_v1 /formatting_utils.py
jattokatarratto's picture
Upload folder using huggingface_hub
3724ac8 verified
raw
history blame
1.27 kB
# %%
import textwrap
from benedict import benedict
from langchain_core.documents import Document
def _fixed_width_wrap(text, width: int = 70, join_str: str = "\n"):
    """Wrap *text* as a single paragraph and glue the lines with *join_str*.

    The text is handed to ``textwrap`` as one paragraph, so with the default
    wrapper settings any newlines already present in *text* are treated as
    ordinary whitespace rather than preserved as line breaks.

    Args:
        text: The string to wrap.
        width: Maximum length of each wrapped line.
        join_str: Separator placed between the wrapped lines.

    Returns:
        The wrapped lines joined by *join_str*; an empty string when
        wrapping yields no lines.
    """
    wrapper = textwrap.TextWrapper(width=width)
    wrapped_lines = wrapper.wrap(text)
    return join_str.join(wrapped_lines)
def fixed_width_wrap(text, width: int = 70, join_str: str = "\n", split_str="\n"):
    """Wrap *text* piece by piece so existing separators survive wrapping.

    The input is first cut on *split_str*; each piece is wrapped on its own
    with ``_fixed_width_wrap`` and the results are stitched back together
    with *join_str*. An empty piece (e.g. from two consecutive separators)
    wraps to an empty string, so it still contributes a separator to the
    output.

    Args:
        text: The string to wrap.
        width: Maximum length of each wrapped line.
        join_str: Separator used both between wrapped lines of one piece
            and between consecutive pieces.
        split_str: Separator on which *text* is split before wrapping.

    Returns:
        The re-assembled, wrapped string.
    """
    pieces = text.split(split_str)
    wrapped_pieces = (
        _fixed_width_wrap(piece, width=width, join_str=join_str)
        for piece in pieces
    )
    return join_str.join(wrapped_pieces)
def format_doc_minimal(d, fixed_width=False):
    """Render one document chunk as a compact, tab-separated text summary.

    Args:
        d: Either a ``Document`` instance (read through its ``page_content``
            and ``metadata`` attributes) or any object supporting
            ``d["page_content"]`` and ``d["metadata"]``.
        fixed_width: When true, the chunk content is re-wrapped with
            ``_fixed_width_wrap`` using that helper's default width and
            line separator.

    Returns:
        A multi-line string with the title, publication date, source and
        chunk content. The source name is ``source.host`` when that value is
        truthy, otherwise ``source.id``; the source country falls back to
        ``"n/a"``.
    """
    is_document = isinstance(d, Document)
    content = d.page_content if is_document else d["page_content"]
    # Wrapping the metadata in a benedict lets the "source.*" lookups below
    # be expressed as single keys (presumably resolved as dotted keypaths by
    # benedict -- confirm against the metadata schema).
    metadata = benedict(d.metadata if is_document else d["metadata"])

    if fixed_width:
        content = _fixed_width_wrap(content)

    template = """\
Title:\t{title}
Published on:\t{pubdate}
Source:\t{source_name} ({source_country})
Chunk Content:
\t{cont}
"""
    fields = {
        "title": metadata.get("title"),
        "pubdate": metadata.get("pubdate"),
        "source_name": metadata.get("source.host") or metadata.get("source.id"),
        "source_country": metadata.get("source.country", "n/a"),
        "cont": content,
    }
    return template.format(**fields)
def format_docs(docs, doc_fn=format_doc_minimal, **kwargs):
    """Format every document in *docs* and join them with a ``---`` rule.

    Args:
        docs: Iterable of documents accepted by *doc_fn*.
        doc_fn: Callable that turns one document into a string; defaults to
            ``format_doc_minimal``.
        **kwargs: Extra keyword arguments forwarded unchanged to *doc_fn*
            for each document.

    Returns:
        The formatted documents separated by ``"\\n---\\n"``; an empty string
        when *docs* is empty.
    """
    separator = "\n---\n"
    rendered = (doc_fn(doc, **kwargs) for doc in docs)
    return separator.join(rendered)