Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| import type { SerializedHTMLElement } from "../scrape/types"; | |
| import { htmlElementToMarkdownElements, mergeAdjacentElements } from "./fromHtml"; | |
| import type { HeaderElement, MarkdownElement } from "./types"; | |
| import { MarkdownElementType } from "./types"; | |
| import { chunkElements } from "./utils/chunk"; | |
| /** | |
| * Converts HTML elements to Markdown elements and creates a tree based on header tags | |
| * For example: h1 [h2 [p p blockquote] h2 [h3 [...] ] ] | |
| **/ | |
| export function htmlToMarkdownTree( | |
| title: string, | |
| htmlElements: SerializedHTMLElement[], | |
| maxCharsPerElem: number | |
| ): HeaderElement { | |
| let parent: HeaderElement = { | |
| type: MarkdownElementType.Header, | |
| level: 1, | |
| parent: null, | |
| content: title, | |
| children: [], | |
| }; | |
| const markdownElements = chunkElements( | |
| mergeAdjacentElements( | |
| htmlElements.flatMap((elem) => htmlElementToMarkdownElements(parent, elem)) | |
| ), | |
| maxCharsPerElem | |
| ); | |
| for (const elem of markdownElements) { | |
| if (elem.type !== MarkdownElementType.Header) { | |
| elem.parent = parent; | |
| parent.children.push(elem); | |
| continue; | |
| } | |
| // add 1 to current level to offset for the title being level 1 | |
| elem.level += 1; | |
| // Pop up header levels until reaching the same level as the current header | |
| // or until we reach the root | |
| inner: while (parent !== null && parent.parent !== null) { | |
| if (parent.level < elem.level) break inner; | |
| parent = parent.parent; | |
| } | |
| parent.children.push(elem); | |
| parent = elem; | |
| } | |
| // Pop up to the root | |
| while (parent.parent !== null) { | |
| parent = parent.parent; | |
| } | |
| return parent; | |
| } | |
| export function removeParents<T extends MarkdownElement>(elem: T): T { | |
| if ("children" in elem) { | |
| return { ...elem, parent: null, children: elem.children.map((child) => removeParents(child)) }; | |
| } | |
| return { ...elem, parent: null }; | |
| } | |