/**
 * Chat parser
 * Pokemon Showdown - http://pokemonshowdown.com/
 *
 * Parses chat formatting.
 *
 * @license MIT
 */

/*

SOURCE FOR LINKREGEX (compile with https://regexfree.k55.io/ )

NOTE: the regex runs on text that has ALREADY been HTML-escaped, which is why
it refers to `&amp;`, `&quot;` and `&gt;` rather than the raw characters.

  (
    (
      # When using http://, allow any domain
      https?:\/\/ [a-z0-9-]+ ( \. [a-z0-9-]+ )*
    |
      # When using www., expect at least one more dot
      www \. [a-z0-9-]+ ( \. [a-z0-9-]+ )+
    |
      # Otherwise, allow any domain, but only if
      \b [a-z0-9-]+ ( \. [a-z0-9-]+ )* \.
      (
        # followed by either a common TLD...
        ( com? | org | net | edu | info | us | jp ) \b
      |
        # or any 2-3 letter TLD followed by a port or /
        [a-z]{2,3} (?= :[0-9] | / )
      )
    )
    # possible custom port
    ( : [0-9]+ )?
    (
      \/
      (
        # characters allowed inside URL paths
        (
          [^\s()&<>[\]] | &amp; | &quot;
        |
          # parentheses in URLs should be matched, so they're not confused
          # for parentheses around URLs
          \( ( [^\s()<>&[\]] | &amp; )* \)
        |
          \[ ( [^\s()<>&[\]] | &amp; )* ]
        )*
        # URLs usually don't end with punctuation, so don't allow
        # punctuation symbols that probably aren't related to URL.
        (
          [^\s()[\]{}\".,!?;:&<>*`^~\\]
        |
          # annoyingly, Wikipedia URLs often end in )
          \( ( [^\s()<>&[\]] | &amp; )* \)
        )
      )?
    )?
  |
    # email address
    [a-z0-9.]+ @ [a-z0-9-]+ ( \. [a-z0-9-]+ )* \. [a-z]{2,}
  )
  (?! [^ ]*&gt; )

*/
export const linkRegex = /(?:(?:https?:\/\/[a-z0-9-]+(?:\.[a-z0-9-]+)*|www\.[a-z0-9-]+(?:\.[a-z0-9-]+)+|\b[a-z0-9-]+(?:\.[a-z0-9-]+)*\.(?:(?:com?|org|net|edu|info|us|jp)\b|[a-z]{2,3}(?=:[0-9]|\/)))(?::[0-9]+)?(?:\/(?:(?:[^\s()&<>[\]]|&amp;|&quot;|\((?:[^\s()<>&[\]]|&amp;)*\)|\[(?:[^\s()<>&[\]]|&amp;)*])*(?:[^\s()[\]{}".,!?;:&<>*`^~\\]|\((?:[^\s()<>&[\]]|&amp;)*\)))?)?|[a-z0-9.]+@[a-z0-9-]+(?:\.[a-z0-9-]+)*\.[a-z]{2,})(?![^ ]*&gt;)/ig;

/**
 * A span is a part of the text that's formatted. In the text:
 *
 *     Hi, **this** is an example.
 *
 * The word `this` is a `*` span. Many spans are just a symbol repeated, and
 * that symbol is the span type, but also many are more complicated.
 * For an explanation of all of these, see the `TextFormatter#get` function
 * implementation.
 */
type SpanType = '_' | '*' | '~' | '^' | '\\' | '|' | '<' | '[' | '`' | 'a' | 'u' | 'spoiler' | '>' | '(';

/** `[span type, index into TextFormatter#buffers of the span's opening delimiter]` */
type FormatSpan = [SpanType, number];

/**
 * Single-use converter from chat markup to HTML.
 *
 * The constructor HTML-escapes the input and converts bare URLs to links;
 * `get()` then walks the escaped string once, collecting output fragments in
 * `buffers` and tracking currently-open spans in `stack`.
 */
class TextFormatter {
	/** The HTML-escaped input, with URLs already replaced by `<a>`/`<u>` elements. */
	readonly str: string;
	/** Output fragments; joined together at the end of `get()`. */
	readonly buffers: string[];
	/** Spans that have been opened but not (yet) closed. */
	readonly stack: FormatSpan[];
	/** Allows access to special formatting (links without URL preview, pokemon icons) */
	readonly isTrusted: boolean;
	/** Replace \n with `<br />` */
	readonly replaceLinebreaks: boolean;
	/** Discord-style WYSIWYM output; markup characters are in `<tt>` */
	readonly showSyntax: boolean;
	/** offset of str that's been parsed so far */
	offset: number;

	constructor(str: string, isTrusted = false, replaceLinebreaks = false, showSyntax = false) {
		// escapeHTML, without escaping /
		// (`&` must be escaped first so the other entities aren't double-escaped)
		str = `${str}`
			.replace(/&/g, '&amp;')
			.replace(/</g, '&lt;')
			.replace(/>/g, '&gt;')
			.replace(/"/g, '&quot;')
			.replace(/'/g, '&apos;');

		// filter links first
		// (links don't have any specific syntax, they're just a pattern, so we detect them in a separate pass)
		str = str.replace(linkRegex, uri => {
			if (showSyntax) return `<u>${uri}</u>`;
			let fulluri: string;
			if (/^[a-z0-9.]+@/i.test(uri)) {
				fulluri = 'mailto:' + uri;
			} else {
				// prepend a protocol when the match doesn't start with one
				fulluri = uri.replace(/^([a-z]*[^a-z:])/g, 'http://$1');
				if (uri.startsWith('https://docs.google.com/') || uri.startsWith('docs.google.com/')) {
					// Google Docs URLs are long and noisy; shorten the DISPLAYED text only
					if (uri.startsWith('https')) uri = uri.slice(8);
					// `uri` is already HTML-escaped, so a literal `&` appears as `&amp;`
					for (const suffix of ['?usp=sharing', '&amp;usp=sharing']) {
						if (uri.endsWith(suffix)) {
							uri = uri.slice(0, -suffix.length);
							break;
						}
					}
					if (uri.endsWith('#gid=0')) uri = uri.slice(0, -6);
					let slashIndex = uri.lastIndexOf('/');
					if (uri.length - slashIndex > 18) slashIndex = uri.length;
					if (slashIndex - 4 > 19 + 3) {
						uri = `${uri.slice(0, 19)}<small class="message-overflow">${uri.slice(19, slashIndex - 4)}</small>` +
							`${uri.slice(slashIndex - 4)}`;
					}
				}
			}
			return `<a href="${fulluri}" rel="noopener" target="_blank">${uri}</a>`;
		});

		this.str = str;
		this.buffers = [];
		this.stack = [];
		this.isTrusted = isTrusted;
		this.replaceLinebreaks = this.isTrusted || replaceLinebreaks;
		this.showSyntax = showSyntax;
		this.offset = 0;
	}

	// debugAt(i=0, j=i+1) { console.log(`${this.slice(0, i)}[${this.slice(i, j)}]${this.slice(j, this.str.length)}`); }

	/** Substring of the escaped input. */
	slice(start: number, end: number) {
		return this.str.slice(start, end);
	}

	/** Character of the escaped input at `start`, or `''` when out of range. */
	at(start: number) {
		return this.str.charAt(start);
	}

	/**
	 * We've encountered a possible start for a span. It's pushed onto our span
	 * stack.
	 *
	 * The span stack saves the start position so it can be replaced with HTML
	 * if we find an end for the span, but we don't actually replace it until
	 * `closeSpan` is called, so nothing happens (it stays plaintext) if no end
	 * is found.
	 */
	pushSpan(spanType: SpanType, start: number, end: number) {
		this.pushSlice(start);
		this.stack.push([spanType, this.buffers.length]);
		this.buffers.push(this.slice(start, end));
		this.offset = end;
	}

	/** Flushes the not-yet-emitted text in `[offset, end)` to the output unchanged. */
	pushSlice(end: number) {
		if (end !== this.offset) {
			this.buffers.push(this.slice(this.offset, end));
			this.offset = end;
		}
	}

	/**
	 * Handles a `)`: if the innermost open spans are spoilers directly inside
	 * a `(` span, ends them (and the `(` span) at `start`.
	 * @returns whether a matching `(` span was found and closed
	 */
	closeParenSpan(start: number) {
		let stackPosition = -1;
		for (let i = this.stack.length - 1; i >= 0; i--) {
			const span = this.stack[i];
			if (span[0] === '(') {
				stackPosition = i;
				break;
			}
			// only spoiler spans may sit between the `(` and the `)`
			if (span[0] !== 'spoiler') break;
		}
		if (stackPosition === -1) return false;

		this.pushSlice(start);
		while (this.stack.length > stackPosition) this.popSpan(start);
		this.offset = start;
		return true;
	}

	/**
	 * We've encountered a possible end for a span. If it's in the span stack,
	 * we transform it into HTML.
	 * @returns whether an open span of `spanType` was found
	 */
	closeSpan(spanType: SpanType, start: number, end: number) {
		// loop backwards
		let stackPosition = -1;
		for (let i = this.stack.length - 1; i >= 0; i--) {
			const span = this.stack[i];
			if (span[0] === spanType) {
				stackPosition = i;
				break;
			}
		}
		if (stackPosition === -1) return false;

		this.pushSlice(start);
		// spans opened after this one end without taking effect (or implicitly, for spoilers)
		while (this.stack.length > stackPosition + 1) this.popSpan(start);

		const span = this.stack.pop()!;
		const startIndex = span[1];
		let tagName = '';
		let attrs = '';
		switch (spanType) {
		case '_': tagName = 'i'; break;
		case '*': tagName = 'b'; break;
		case '~': tagName = 's'; break;
		case '^': tagName = 'sup'; break;
		case '\\': tagName = 'sub'; break;
		case '|':
			tagName = 'span';
			attrs = (this.showSyntax ? ' class="spoiler-shown"' : ' class="spoiler"');
			break;
		}
		const syntax = (this.showSyntax ? `<tt>${spanType}${spanType}</tt>` : '');
		if (tagName) {
			// replace the plaintext opening delimiter with the opening tag
			this.buffers[startIndex] = `${syntax}<${tagName}${attrs}>`;
			this.buffers.push(`</${tagName}>${syntax}`);
			this.offset = end;
		}
		return true;
	}

	/**
	 * Ends a span without an ending symbol. For most spans, this means
	 * they don't take effect, but certain spans like spoiler tags don't
	 * require ending symbols.
	 * @returns `false` if there was no open span
	 */
	popSpan(end: number) {
		const span = this.stack.pop();
		if (!span) return false;

		this.pushSlice(end);
		switch (span[0]) {
		case 'spoiler':
			this.buffers.push(`</span>`);
			this.buffers[span[1]] = (this.showSyntax ? `<span class="spoiler-shown">` : `<span class="spoiler">`);
			break;
		case '>':
			this.buffers.push(`</span>`);
			this.buffers[span[1]] = `<span class="greentext">`;
			break;
		default:
			// do nothing
			break;
		}
		return true;
	}

	/** Ends every open span and flushes remaining text up to `end`. */
	popAllSpans(end: number) {
		while (this.stack.length) this.popSpan(end);
		this.pushSlice(end);
	}

	/**
	 * Un-escapes the HTML entities produced by the constructor, then
	 * percent-encodes the result for use inside a URL query string.
	 */
	toUriComponent(html: string) {
		// `&amp;` goes last so that e.g. `&amp;lt;` doesn't get unescaped twice
		const component = html.replace(/&lt;/g, '<')
			.replace(/&gt;/g, '>')
			.replace(/&quot;/g, '"')
			.replace(/&apos;/g, '\'')
			.replace(/&amp;/g, '&');
		return encodeURIComponent(component);
	}

	/**
	 * Handles special cases: spans whose contents must not be formatted, so
	 * the whole span is consumed in one go.
	 * @returns whether a span was recognized (in which case `offset` has advanced past it)
	 */
	runLookahead(spanType: SpanType, start: number): boolean {
		switch (spanType) {
		case '`':
			// code span. Not only are the contents not formatted, but
			// the start and end delimiters must match in length.
			// ``Neither `this` nor ```this``` end this code span.``
			{
				let delimLength = 0;
				let i = start;
				while (this.at(i) === '`') {
					delimLength++;
					i++;
				}
				let curDelimLength = 0;
				while (i < this.str.length) {
					const char = this.at(i);
					if (char === '\n') break;
					if (char === '`') {
						curDelimLength++;
					} else {
						if (curDelimLength === delimLength) break;
						curDelimLength = 0;
					}
					i++;
				}
				if (curDelimLength !== delimLength) return false;
				const end = i;
				// matching delims found
				this.pushSlice(start);
				let innerStart = start + delimLength;
				let innerEnd = i - delimLength;
				if (innerStart + 1 >= innerEnd) {
					// no special whitespace handling
				} else if (this.at(innerStart) === ' ' && this.at(innerEnd - 1) === ' ') {
					innerStart++; // strip starting and ending space
					innerEnd--;
				} else if (this.at(innerStart) === ' ' && this.at(innerStart + 1) === '`') {
					innerStart++; // strip starting space
				} else if (this.at(innerEnd - 1) === ' ' && this.at(innerEnd - 2) === '`') {
					innerEnd--; // strip ending space
				}
				if (this.showSyntax) this.buffers.push(`<tt>${this.slice(start, innerStart)}</tt>`);
				this.buffers.push(`<code>`);
				this.buffers.push(this.slice(innerStart, innerEnd));
				this.buffers.push(`</code>`);
				if (this.showSyntax) this.buffers.push(`<tt>${this.slice(innerEnd, end)}</tt>`);
				this.offset = end;
			}
			return true;
		case '[':
			// Link span. Several possibilities:
			// [[text <uri>]] - a link with custom text
			// [[search term]] - Google search
			// [[wiki: search term]] - Wikipedia search
			// [[pokemon: species name]] - icon (also item:, type:, category:)
			{
				if (this.slice(start, start + 2) !== '[[') return false;
				let i = start + 2;
				let colonPos = -1; // `:`
				let anglePos = -1; // `<` (appears as `&lt;` in the escaped text)
				while (i < this.str.length) {
					const char = this.at(i);
					if (char === ']' || char === '\n') break;
					if (char === ':' && colonPos < 0) colonPos = i;
					if (char === '&' && this.slice(i, i + 4) === '&lt;') anglePos = i;
					i++;
				}
				if (this.slice(i, i + 2) !== ']]') return false;

				this.pushSlice(start);
				this.offset = i + 2;
				let termEnd = i;
				let uri = '';
				if (anglePos >= 0 && this.slice(i - 4, i) === '&gt;') { // `>`
					uri = this.slice(anglePos + 4, i - 4);
					termEnd = anglePos;
					if (this.at(termEnd - 1) === ' ') termEnd--;
					uri = encodeURI(uri.replace(/^([a-z]*[^a-z:])/g, 'http://$1'));
				}
				// drop any <a>/<u> tags the constructor's link pass put inside the term
				let term = this.slice(start + 2, termEnd).replace(/<\/?[au](?: [^>]+)?>/g, '');
				if (this.showSyntax) {
					term += `<small>${this.slice(termEnd, i)}</small>`;
				} else if (uri && !this.isTrusted) {
					// untrusted users' custom links always show where they actually lead
					const shortUri = uri.replace(/^https?:\/\//, '').replace(/^www\./, '').replace(/\/$/, '');
					term += `<small> &lt;${shortUri}&gt;</small>`;
					// `uri` is interpolated inside `href="..."` below, so this closes
					// the href attribute and opens a rel attribute
					uri += '" rel="noopener';
				}

				if (colonPos > 0) {
					const key = this.slice(start + 2, colonPos).toLowerCase();
					switch (key) {
					case 'w':
					case 'wiki':
						if (this.showSyntax) break;
						term = term.slice(term.charAt(key.length + 1) === ' ' ? key.length + 2 : key.length + 1);
						uri = `//en.wikipedia.org/w/index.php?title=Special:Search&search=${this.toUriComponent(term)}`;
						term = `wiki: ${term}`;
						break;
					case 'pokemon':
					case 'item':
					case 'type':
					case 'category': {
						if (this.showSyntax) {
							this.buffers.push(`<tt>${this.slice(start, this.offset)}</tt>`);
							return true;
						}
						term = term.slice(term.charAt(key.length + 1) === ' ' ? key.length + 2 : key.length + 1);

						let display = '';
						if (this.isTrusted) {
							display = `<psicon ${key}="${term}" />`;
						} else {
							display = `[${term}]`;
						}

						let dir = key;
						if (key === 'item') dir += 's';
						if (key === 'category') dir = 'categories' as 'category';

						// NOTE(review): `toID` is not defined in this file; presumably a project-wide global
						uri = `//dex.pokemonshowdown.com/${dir}/${toID(term)}`;
						term = display;
					}
					}
				}
				if (!uri) {
					uri = `//www.google.com/search?ie=UTF-8&btnI&q=${this.toUriComponent(term)}`;
				}
				if (this.showSyntax) {
					this.buffers.push(`<tt>[[</tt><u>${term}</u><tt>]]</tt>`);
				} else {
					this.buffers.push(`<a href="${uri}" target="_blank">${term}</a>`);
				}
			}
			return true;
		case '<':
			// Roomid-link span. Not to be confused with a URL span.
			// `<<roomid>>`
			{
				if (this.slice(start, start + 8) !== '&lt;&lt;') return false; // <<
				let i = start + 8;
				while (/[a-z0-9-]/.test(this.at(i))) i++;
				if (this.slice(i, i + 8) !== '&gt;&gt;') return false; // >>

				this.pushSlice(start);
				const roomid = this.slice(start + 8, i);
				if (this.showSyntax) {
					this.buffers.push(`<small>&lt;&lt;</small><u>${roomid}</u><small>&gt;&gt;</small>`);
				} else {
					this.buffers.push(`&laquo;<a href="/${roomid}" target="_blank">${roomid}</a>&raquo;`);
				}
				this.offset = i + 8;
			}
			return true;
		case 'a': case 'u':
			// URL span. Skip to the end of the link - where `</a>` or `</u>` is.
			// Nothing inside should be formatted further (obviously we don't want
			// `example.com/__foo__` to turn `foo` italic).
			{
				let i = start + 2;
				// Find </a> or </u>.
				// We need to check the location of `>` to disambiguate from </small>.
				while (
					i < this.str.length &&
					(this.at(i) !== '<' || this.at(i + 1) !== '/' || this.at(i + 3) !== '>')
				) i++;
				// should never happen (the constructor always emits a closing tag),
				// but don't spin forever if it somehow does
				if (i >= this.str.length) return false;
				i += 4;
				this.pushSlice(i);
			}
			return true;
		}
		return false;
	}

	/**
	 * Runs the formatter over the whole input.
	 * @returns the resulting HTML
	 */
	get(): string {
		let beginningOfLine = this.offset;
		// main loop! `i` tracks our position
		// Note that we skip around a lot; `i` is mutated inside the loop
		// pretty often.
		for (let i = beginningOfLine; i < this.str.length; i++) {
			const char = this.at(i);
			switch (char) {
			case '_':
			case '*':
			case '~':
			case '^':
			case '\\':
			case '|':
				// Must be exactly two chars long.
				if (this.at(i + 1) === char && this.at(i + 2) !== char) {
					// This is a completely normal two-char span. Close it if it's
					// already open, open it if it's not.
					// The inside of regular spans must not start or end with a space.
					if (!(this.at(i - 1) !== ' ' && this.closeSpan(char, i, i + 2))) {
						if (this.at(i + 2) !== ' ') this.pushSpan(char, i, i + 2);
					}
					if (i < this.offset) {
						i = this.offset - 1;
						break;
					}
				}
				// skip the rest of a run of the same symbol
				while (this.at(i + 1) === char) i++;
				break;
			case '(':
				// `(` span - does nothing except end spans
				this.stack.push(['(', -1]);
				break;
			case ')':
				// end of `(` span
				this.closeParenSpan(i);
				if (i < this.offset) {
					i = this.offset - 1;
					break;
				}
				break;
			case '`':
				// ` ``code`` ` span. Uses lookahead because its contents are not
				// formatted.
				// Must be at least two `` ` `` in a row.
				if (this.at(i + 1) === '`') this.runLookahead('`', i);
				if (i < this.offset) {
					i = this.offset - 1;
					break;
				}
				while (this.at(i + 1) === '`') i++;
				break;
			case '[':
				// `[` (link) span. Uses lookahead because it might contain a
				// URL which can't be formatted, or search terms that can't be
				// formatted.
				this.runLookahead('[', i);
				if (i < this.offset) {
					i = this.offset - 1;
					break;
				}
				while (this.at(i + 1) === '[') i++;
				break;
			case ':':
				// Looks behind for `spoiler:` or `spoilers:`. Spoiler spans
				// are also weird because they don't require an ending symbol,
				// although that's not handled here.
				if (i < 7) break;
				if (this.slice(i - 7, i + 1).toLowerCase() === 'spoiler:' ||
					this.slice(i - 8, i + 1).toLowerCase() === 'spoilers:') {
					if (this.at(i + 1) === ' ') i++;
					this.pushSpan('spoiler', i + 1, i + 1);
				}
				break;
			case '&': // escaped '<' or '>'
				// greentext or roomid
				if (i === beginningOfLine && this.slice(i, i + 4) === '&gt;') {
					// greentext span, normal except it lacks an ending span
					// check for certain emoticons like `>_>` or `>w<`
					if (!"._/=:;".includes(this.at(i + 4)) && !['w&lt;', 'w&gt;'].includes(this.slice(i + 4, i + 9))) {
						this.pushSpan('>', i, i);
					}
				} else {
					// completely normal `<<roomid>>` span
					// uses lookahead because roomids can't be formatted.
					this.runLookahead('<', i);
				}
				if (i < this.offset) {
					i = this.offset - 1;
					break;
				}
				// skip a run of `&lt;&lt;&lt;...` so only the last pair can start a roomid span
				while (this.slice(i + 1, i + 5) === 'lt;&') i += 4;
				break;
			case '<': // guaranteed to be <a ...> or <u>
				// URL span
				// The constructor has already converted `<` to `&lt;` and URLs
				// to links, so `<` must be the start of a converted link.
				this.runLookahead('a', i);
				if (i < this.offset) {
					i = this.offset - 1;
					break;
				}
				// should never happen
				break;
			case '\r':
			case '\n':
				// End of the line. No spans span multiple lines.
				this.popAllSpans(i);
				if (this.replaceLinebreaks) {
					this.buffers.push(`<br />`);
					this.offset++;
				}
				beginningOfLine = i + 1;
				break;
			}
		}

		this.popAllSpans(this.str.length);
		return this.buffers.join('');
	}
}

/**
 * Takes a string and converts it to HTML by replacing standard chat formatting with the appropriate HTML tags.
 *
 * @param str raw (unescaped) chat text
 * @param isTrusted allows icons and custom links without a URL preview; also implies `replaceLinebreaks`
 * @param replaceLinebreaks emit `<br />` for line breaks
 * @param showSyntax keep the markup characters visible in the output
 */
export function formatText(str: string, isTrusted = false, replaceLinebreaks = false, showSyntax = false) {
	return new TextFormatter(str, isTrusted, replaceLinebreaks, showSyntax).get();
}

/**
 * Takes a string and strips all standard chat formatting except greentext from it, the text of a link is kept.
 */
export function stripFormatting(str: string) {
	// Doesn't match > meme arrows because the angle bracket appears in the chat still.
	// Every delimiter is a doubled symbol, including the doubled backslash of subscript.
	str = str.replace(
		/\*\*([^\s*]+)\*\*|__([^\s_]+)__|~~([^\s~]+)~~|``([^\s`]+)``|\^\^([^\s^]+)\^\^|\\\\([^\s\\]+)\\\\/g,
		(match, $1, $2, $3, $4, $5, $6) => $1 || $2 || $3 || $4 || $5 || $6
	);
	// Remove all of the link except for the text in [[text<url>]]
	return str.replace(/\[\[(?:([^<]*)\s*<[^>]+>|([^\]]+))\]\]/g, (match, $1, $2) => $1 || $2 || '');
}