Spaces:
Running
Running
# %%
import textwrap | |
from benedict import benedict | |
from langchain_core.documents import Document | |
def _fixed_width_wrap(text, width: int = 70, join_str: str = "\n"):
    """Wrap ``text`` to at most ``width`` columns and glue the lines together.

    The whole of ``text`` is handed to ``textwrap.wrap`` in one call, and the
    list of lines it produces is joined with ``join_str``.

    Args:
        text: The string to wrap.
        width: Maximum line length passed to ``textwrap.wrap``.
        join_str: Separator placed between the wrapped lines.

    Returns:
        The wrapped lines joined into a single string.
    """
    wrapped_lines = textwrap.wrap(text, width=width)
    return join_str.join(wrapped_lines)
def fixed_width_wrap(text, width: int = 70, join_str: str = "\n", split_str="\n"):
    """Wrap ``text`` piece by piece so existing separators are kept.

    ``text`` is first cut on ``split_str``; every piece is wrapped on its own
    with ``_fixed_width_wrap`` and the wrapped pieces are then joined again
    with ``join_str``.

    Args:
        text: The string to wrap.
        width: Maximum line length forwarded to ``_fixed_width_wrap``.
        join_str: Separator used both inside each wrapped piece and between
            the pieces.
        split_str: Separator on which ``text`` is split before wrapping.

    Returns:
        The re-joined, wrapped text.
    """
    pieces = text.split(split_str)
    wrapped_pieces = (
        _fixed_width_wrap(piece, width=width, join_str=join_str)
        for piece in pieces
    )
    return join_str.join(wrapped_pieces)
def format_doc_minimal(d, fixed_width=False):
    """Render one document as a short plain-text summary.

    Args:
        d: Either a ``Document`` (read through its ``page_content`` and
            ``metadata`` attributes) or a mapping with ``"page_content"``
            and ``"metadata"`` keys.
        fixed_width: When true, the content is wrapped with
            ``_fixed_width_wrap`` using that helper's default width.

    Returns:
        A string with title, publication date, source and chunk content.
    """
    is_document = isinstance(d, Document)
    body = d.page_content if is_document else d["page_content"]
    # Metadata is wrapped in ``benedict`` so the dotted keys below can be
    # looked up -- presumably as nested key paths; confirm against benedict.
    meta = benedict(d.metadata if is_document else d["metadata"])

    if fixed_width:
        body = _fixed_width_wrap(body)

    # NOTE(review): template rebuilt line by line from the visible source;
    # confirm no blank lines are expected between the fields.
    template = (
        "Title:\t{title}\n"
        "Published on:\t{pubdate}\n"
        "Source:\t{source_name} ({source_country})\n"
        "Chunk Content:\n"
        "\t{cont}\n"
    )

    # The host is preferred as the source name; the id is only used when the
    # host lookup yields a falsy value.
    source_name = meta.get("source.host") or meta.get("source.id")

    return template.format(
        title=meta.get("title"),
        pubdate=meta.get("pubdate"),
        source_name=source_name,
        source_country=meta.get("source.country", "n/a"),
        cont=body,
    )
def format_docs(docs, doc_fn=format_doc_minimal, **kwargs):
    """Format every document in ``docs`` and join the results.

    Args:
        docs: Iterable of documents accepted by ``doc_fn``.
        doc_fn: Callable that renders a single document to a string;
            defaults to ``format_doc_minimal``.
        **kwargs: Extra keyword arguments forwarded to every ``doc_fn`` call.

    Returns:
        The rendered documents separated by a ``---`` line.
    """
    separator = "\n---\n"
    return separator.join(doc_fn(doc, **kwargs) for doc in docs)