/**
* Chat parser
* Pokemon Showdown - http://pokemonshowdown.com/
*
* Parses chat formatting markup and converts it to HTML.
*
* @license MIT
*/
/*

SOURCE FOR LINKREGEX (compile with https://regexfree.k55.io/ )

NOTE: this regex is run on text that has ALREADY been HTML-escaped, so
`&`, `"` and `>` appear as `&amp;`, `&quot;` and `&gt;`. A bare `&` must
never be accepted inside a URL, or entities such as `&lt;`/`&gt;` would
be swallowed into the link.

(
  (
    # When using http://, allow any domain
    https?:\/\/ [a-z0-9-]+ ( \. [a-z0-9-]+ )*
  |
    # When using www., expect at least one more dot
    www \. [a-z0-9-]+ ( \. [a-z0-9-]+ )+
  |
    # Otherwise, allow any domain, but only if it is
    \b [a-z0-9-]+ ( \. [a-z0-9-]+ )* \.
    (
      # followed by either a common TLD...
      ( com? | org | net | edu | info | us | jp ) \b
    |
      # or any 2-3 letter TLD followed by a port or /
      [a-z]{2,3} (?= :[0-9] | / )
    )
  )
  # possible custom port
  ( : [0-9]+ )?
  (
    \/
    (
      # characters allowed inside URL paths
      (
        [^\s()&<>[\]] | &amp; | &quot;
      |
        # parentheses in URLs should be matched, so they're not confused
        # for parentheses around URLs
        \( ( [^\s()<>&[\]] | &amp; )* \)
      |
        \[ ( [^\s()<>&[\]] | &amp; )* ]
      )*
      # URLs usually don't end with punctuation, so don't allow
      # punctuation symbols that probably aren't related to the URL.
      (
        [^\s()[\]{}\".,!?;:&<>*`^~\\]
      |
        # annoyingly, Wikipedia URLs often end in )
        \( ( [^\s()<>&[\]] | &amp; )* \)
      )
    )?
  )?
|
  # email address
  [a-z0-9.]+ @ [a-z0-9-]+ ( \. [a-z0-9-]+ )* \. [a-z]{2,}
)
# don't match a URL that is followed (before the next space) by an escaped
# `>`: that is the `<url>` part of a `[[text <url>]]` span
(?! [^ ]*&gt; )

*/
/**
 * Matches URLs and email addresses inside HTML-escaped chat text.
 *
 * Carries the `g` flag, so it is stateful when used with `exec`/`test`;
 * use it with `String#replace`/`String#match`, which reset `lastIndex`.
 */
export const linkRegex = /(?:(?:https?:\/\/[a-z0-9-]+(?:\.[a-z0-9-]+)*|www\.[a-z0-9-]+(?:\.[a-z0-9-]+)+|\b[a-z0-9-]+(?:\.[a-z0-9-]+)*\.(?:(?:com?|org|net|edu|info|us|jp)\b|[a-z]{2,3}(?=:[0-9]|\/)))(?::[0-9]+)?(?:\/(?:(?:[^\s()&<>[\]]|&amp;|&quot;|\((?:[^\s()<>&[\]]|&amp;)*\)|\[(?:[^\s()<>&[\]]|&amp;)*])*(?:[^\s()[\]{}".,!?;:&<>*`^~\\]|\((?:[^\s()<>&[\]]|&amp;)*\)))?)?|[a-z0-9.]+@[a-z0-9-]+(?:\.[a-z0-9-]+)*\.[a-z]{2,})(?![^ ]*&gt;)/ig;
/**
 * A span is a formatted region of the text. For example, in
 *
 *     Hi, **this** is an example.
 *
 * the word `this` sits inside a `*` span. For many spans the type is simply
 * the symbol that is doubled to open/close it, but several are more
 * involved. See the implementation of `TextFormatter#get` for how each
 * one is recognized.
 */
type SpanType =
	| '_' | '*' | '~' | '^' | '\\' | '|'
	| '<' | '[' | '`'
	| 'a' | 'u'
	| 'spoiler' | '>' | '(';
/** A span on the open-span stack: its type, and the `buffers` index recorded when it was opened. */
type FormatSpan = [SpanType, number];
/**
 * Single-use parser that turns one chat message into HTML.
 *
 * The constructor HTML-escapes the input and wraps every URL in an `<a>`
 * (or `<u>` in `showSyntax` mode) tag. `get()` then scans the escaped string
 * once, left to right, collecting output fragments in `buffers`. Spans that
 * have been opened but not yet closed are tracked in `stack`; their opening
 * markup is only swapped for an HTML tag once a matching end is found.
 */
class TextFormatter {
	/** The escaped, link-wrapped text being parsed. */
	readonly str: string;
	/** Output fragments; joined together at the end of `get()`. */
	readonly buffers: string[];
	/** Currently open spans, innermost last. */
	readonly stack: FormatSpan[];
	/** Allows access to special formatting (links without URL preview, pokemon icons) */
	readonly isTrusted: boolean;
	/** Replace \n with `<br />` */
	readonly replaceLinebreaks: boolean;
	/** Discord-style WYSIWYM output; markup characters are in `<tt>` */
	readonly showSyntax: boolean;
	/** offset of str that's been parsed so far */
	offset: number;

	constructor(str: string, isTrusted = false, replaceLinebreaks = false, showSyntax = false) {
		// escapeHTML, without escaping /
		// `&` must be escaped first so the entities added below aren't double-escaped.
		str = `${str}`
			.replace(/&/g, '&amp;')
			.replace(/</g, '&lt;')
			.replace(/>/g, '&gt;')
			.replace(/"/g, '&quot;')
			.replace(/'/g, '&apos;');

		// filter links first
		// (links don't have any specific syntax, they're just a pattern, so we detect them in a separate pass)
		str = str.replace(linkRegex, uri => {
			if (showSyntax) return `<u>${uri}</u>`;
			let fulluri;
			if (/^[a-z0-9.]+@/i.test(uri)) {
				fulluri = 'mailto:' + uri;
			} else {
				// add a protocol if the link doesn't have one
				fulluri = uri.replace(/^([a-z]*[^a-z:])/g, 'http://$1');
				if (uri.startsWith('https://docs.google.com/') || uri.startsWith('docs.google.com/')) {
					// Google Docs links are long; shorten the DISPLAYED text (not the href)
					if (uri.startsWith('https')) uri = uri.slice(8);
					// `uri` is already HTML-escaped here, so a literal `&` appears as `&amp;`
					const sharingSuffix = ['?usp=sharing', '&amp;usp=sharing'].find(suffix => uri.endsWith(suffix));
					if (sharingSuffix) uri = uri.slice(0, -sharingSuffix.length);
					if (uri.endsWith('#gid=0')) uri = uri.slice(0, -6);
					let slashIndex = uri.lastIndexOf('/');
					if (uri.length - slashIndex > 18) slashIndex = uri.length;
					if (slashIndex - 4 > 19 + 3) {
						uri = `${uri.slice(0, 19)}<small class="message-overflow">${uri.slice(19, slashIndex - 4)}</small>` +
							`${uri.slice(slashIndex - 4)}`;
					}
				}
			}
			return `<a href="${fulluri}" rel="noopener" target="_blank">${uri}</a>`;
		});

		this.str = str;
		this.buffers = [];
		this.stack = [];
		this.isTrusted = isTrusted;
		this.replaceLinebreaks = this.isTrusted || replaceLinebreaks;
		this.showSyntax = showSyntax;
		this.offset = 0;
	}

	// debugAt(i=0, j=i+1) { console.log(`${this.slice(0, i)}[${this.slice(i, j)}]${this.slice(j, this.str.length)}`); }

	/** Substring of the text being parsed. */
	slice(start: number, end: number) {
		return this.str.slice(start, end);
	}

	/** Character at `start`, or `''` when out of range. */
	at(start: number) {
		return this.str.charAt(start);
	}

	/**
	 * We've encountered a possible start for a span. It's pushed onto our span
	 * stack.
	 *
	 * The span stack saves the start position so it can be replaced with HTML
	 * if we find an end for the span, but we don't actually replace it until
	 * `closeSpan` is called, so nothing happens (it stays plaintext) if no end
	 * is found.
	 */
	pushSpan(spanType: SpanType, start: number, end: number) {
		this.pushSlice(start);
		this.stack.push([spanType, this.buffers.length]);
		this.buffers.push(this.slice(start, end));
		this.offset = end;
	}

	/** Flushes the unparsed text between `offset` and `end` to the output as-is. */
	pushSlice(end: number) {
		if (end !== this.offset) {
			this.buffers.push(this.slice(this.offset, end));
			this.offset = end;
		}
	}

	/**
	 * Handles `)`: if the innermost open spans are spoilers directly inside a
	 * `(` span, ends them (and the `(` span) here.
	 *
	 * @returns whether a `(` span was closed
	 */
	closeParenSpan(start: number) {
		let stackPosition = -1;
		for (let i = this.stack.length - 1; i >= 0; i--) {
			const span = this.stack[i];
			if (span[0] === '(') {
				stackPosition = i;
				break;
			}
			// only spoilers may sit between the `(` and this `)`
			if (span[0] !== 'spoiler') break;
		}
		if (stackPosition === -1) return false;

		this.pushSlice(start);
		while (this.stack.length > stackPosition) this.popSpan(start);
		this.offset = start;
		return true;
	}

	/**
	 * We've encountered a possible end for a span. If it's in the span stack,
	 * we transform it into HTML.
	 *
	 * @returns whether a matching open span was found
	 */
	closeSpan(spanType: SpanType, start: number, end: number) {
		// loop backwards
		let stackPosition = -1;
		for (let i = this.stack.length - 1; i >= 0; i--) {
			const span = this.stack[i];
			if (span[0] === spanType) {
				stackPosition = i;
				break;
			}
		}
		if (stackPosition === -1) return false;

		this.pushSlice(start);
		// anything opened inside this span and still unclosed ends here
		while (this.stack.length > stackPosition + 1) this.popSpan(start);
		const span = this.stack.pop()!;
		const startIndex = span[1];
		let tagName = '';
		let attrs = '';
		switch (spanType) {
		case '_': tagName = 'i'; break;
		case '*': tagName = 'b'; break;
		case '~': tagName = 's'; break;
		case '^': tagName = 'sup'; break;
		case '\\': tagName = 'sub'; break;
		case '|': tagName = 'span'; attrs = (this.showSyntax ? ' class="spoiler-shown"' : ' class="spoiler"'); break;
		}
		const syntax = (this.showSyntax ? `<tt>${spanType}${spanType}</tt>` : '');
		if (tagName) {
			this.buffers[startIndex] = `${syntax}<${tagName}${attrs}>`;
			this.buffers.push(`</${tagName}>${syntax}`);
			this.offset = end;
		}
		return true;
	}

	/**
	 * Ends a span without an ending symbol. For most spans, this means
	 * they don't take effect, but certain spans like spoiler tags don't
	 * require ending symbols.
	 *
	 * @returns `false` if there was no open span to end
	 */
	popSpan(end: number) {
		const span = this.stack.pop();
		if (!span) return false;

		this.pushSlice(end);
		switch (span[0]) {
		case 'spoiler':
			this.buffers.push(`</span>`);
			this.buffers[span[1]] = (this.showSyntax ? `<span class="spoiler-shown">` : `<span class="spoiler">`);
			break;
		case '>':
			this.buffers.push(`</span>`);
			this.buffers[span[1]] = `<span class="greentext">`;
			break;
		default:
			// do nothing
			break;
		}
		return true;
	}

	/** Ends every open span at `end` and flushes the remaining text up to `end`. */
	popAllSpans(end: number) {
		while (this.stack.length) this.popSpan(end);
		this.pushSlice(end);
	}

	/** Undoes the constructor's HTML escaping, then percent-encodes the result for use in a URL. */
	toUriComponent(html: string) {
		// `&amp;` is decoded last so that e.g. `&amp;lt;` becomes `&lt;` rather than `<`
		const component = html.replace(/&lt;/g, '<')
			.replace(/&gt;/g, '>')
			.replace(/&quot;/g, '"')
			.replace(/&apos;/g, '\'')
			.replace(/&amp;/g, '&');
		return encodeURIComponent(component);
	}

	/**
	 * Handles special cases: spans whose contents must not be formatted, and
	 * which therefore have to find their own end immediately instead of going
	 * through the span stack.
	 *
	 * @returns whether a span was recognized (and `offset` advanced past it)
	 */
	runLookahead(spanType: SpanType, start: number) {
		switch (spanType) {
		case '`':
			// code span. Not only are the contents not formatted, but
			// the start and end delimiters must match in length.
			// ``Neither `this` nor ```this``` end this code span.``
			{
				let delimLength = 0;
				let i = start;
				while (this.at(i) === '`') {
					delimLength++;
					i++;
				}
				let curDelimLength = 0;
				while (i < this.str.length) {
					const char = this.at(i);
					if (char === '\n') break;
					if (char === '`') {
						curDelimLength++;
					} else {
						if (curDelimLength === delimLength) break;
						curDelimLength = 0;
					}
					i++;
				}
				if (curDelimLength !== delimLength) return false;
				const end = i;
				// matching delims found
				this.pushSlice(start);
				let innerStart = start + delimLength;
				let innerEnd = i - delimLength;
				if (innerStart + 1 >= innerEnd) {
					// no special whitespace handling
				} else if (this.at(innerStart) === ' ' && this.at(innerEnd - 1) === ' ') {
					innerStart++; // strip starting and ending space
					innerEnd--;
				} else if (this.at(innerStart) === ' ' && this.at(innerStart + 1) === '`') {
					innerStart++; // strip starting space
				} else if (this.at(innerEnd - 1) === ' ' && this.at(innerEnd - 2) === '`') {
					innerEnd--; // strip ending space
				}
				if (this.showSyntax) this.buffers.push(`<tt>${this.slice(start, innerStart)}</tt>`);
				this.buffers.push(`<code>`);
				this.buffers.push(this.slice(innerStart, innerEnd));
				this.buffers.push(`</code>`);
				if (this.showSyntax) this.buffers.push(`<tt>${this.slice(innerEnd, end)}</tt>`);
				this.offset = end;
			}
			return true;
		case '[':
			// Link span. Several possibilities:
			// [[text <uri>]] - a link with custom text
			// [[search term]] - Google search
			// [[wiki: search term]] - Wikipedia search
			// [[pokemon: species name]] - icon (also item:, type:, category:)
			{
				if (this.slice(start, start + 2) !== '[[') return false;
				let i = start + 2;
				let colonPos = -1; // `:`
				let anglePos = -1; // `<` (escaped, so it appears as `&lt;`)
				while (i < this.str.length) {
					const char = this.at(i);
					if (char === ']' || char === '\n') break;
					if (char === ':' && colonPos < 0) colonPos = i;
					if (char === '&' && this.slice(i, i + 4) === '&lt;') anglePos = i;
					i++;
				}
				if (this.slice(i, i + 2) !== ']]') return false;

				this.pushSlice(start);
				this.offset = i + 2;
				let termEnd = i;
				let uri = '';
				if (anglePos >= 0 && this.slice(i - 4, i) === '&gt;') { // `>`
					uri = this.slice(anglePos + 4, i - 4);
					termEnd = anglePos;
					if (this.at(termEnd - 1) === ' ') termEnd--;
					// NOTE(review): only scheme-less URIs get `http://` prepended; other
					// schemes pass through unchanged - confirm output is sanitized downstream.
					uri = encodeURI(uri.replace(/^([a-z]*[^a-z:])/g, 'http://$1'));
				}
				// drop any <a>/<u> tags the constructor's link pass put inside the term
				let term = this.slice(start + 2, termEnd).replace(/<\/?[au](?: [^>]+)?>/g, '');
				if (this.showSyntax) {
					term += `<small>${this.slice(termEnd, i)}</small>`;
				} else if (uri && !this.isTrusted) {
					// untrusted users' custom links show where they actually lead
					const shortUri = uri.replace(/^https?:\/\//, '').replace(/^www\./, '').replace(/\/$/, '');
					term += `<small> &lt;${shortUri}&gt;</small>`;
					// closes the href attribute early to append a rel attribute
					uri += '" rel="noopener';
				}

				if (colonPos > 0) {
					const key = this.slice(start + 2, colonPos).toLowerCase();
					switch (key) {
					case 'w':
					case 'wiki':
						if (this.showSyntax) break;
						term = term.slice(term.charAt(key.length + 1) === ' ' ? key.length + 2 : key.length + 1);
						uri = `//en.wikipedia.org/w/index.php?title=Special:Search&search=${this.toUriComponent(term)}`;
						term = `wiki: ${term}`;
						break;
					case 'pokemon':
					case 'item':
					case 'type':
					case 'category': {
						if (this.showSyntax) {
							this.buffers.push(`<tt>${this.slice(start, this.offset)}</tt>`);
							return true;
						}
						term = term.slice(term.charAt(key.length + 1) === ' ' ? key.length + 2 : key.length + 1);

						let display = '';
						if (this.isTrusted) {
							display = `<psicon ${key}="${term}" />`;
						} else {
							display = `[${term}]`;
						}

						let dir: string = key;
						if (key === 'item') dir += 's';
						if (key === 'category') dir = 'categories';

						uri = `//dex.pokemonshowdown.com/${dir}/${toID(term)}`;
						term = display;
					}
					}
				}
				if (!uri) {
					uri = `//www.google.com/search?ie=UTF-8&btnI&q=${this.toUriComponent(term)}`;
				}
				if (this.showSyntax) {
					this.buffers.push(`<tt>[[</tt><u>${term}</u><tt>]]</tt>`);
				} else {
					this.buffers.push(`<a href="${uri}" target="_blank">${term}</a>`);
				}
			}
			return true;
		case '<':
			// Roomid-link span. Not to be confused with a URL span.
			// `<<roomid>>`
			{
				if (this.slice(start, start + 8) !== '&lt;&lt;') return false; // <<
				let i = start + 8;
				while (/[a-z0-9-]/.test(this.at(i))) i++;
				if (this.slice(i, i + 8) !== '&gt;&gt;') return false; // >>

				this.pushSlice(start);
				const roomid = this.slice(start + 8, i);
				if (this.showSyntax) {
					this.buffers.push(`<small>&lt;&lt;</small><u>${roomid}</u><small>&gt;&gt;</small>`);
				} else {
					this.buffers.push(`&laquo;<a href="/${roomid}" target="_blank">${roomid}</a>&raquo;`);
				}
				this.offset = i + 8;
			}
			return true;
		case 'a': case 'u':
			// URL span. Skip to the end of the link - where `</a>` or `</u>` is.
			// Nothing inside should be formatted further (obviously we don't want
			// `example.com/__foo__` to turn `foo` italic).
			{
				let i = start + 2;
				// Find </a> or </u>.
				// We need to check the location of `>` to disambiguate from </small>.
				// The constructor always emits the closing tag; the length bound only
				// guards against looping forever if that ever stops being true.
				while (
					i < this.str.length &&
					(this.at(i) !== '<' || this.at(i + 1) !== '/' || this.at(i + 3) !== '>')
				) {
					i++;
				}
				i = Math.min(i + 4, this.str.length);
				this.pushSlice(i);
			}
			return true;
		}
		return false;
	}

	/**
	 * Runs the parser over the whole text and returns the resulting HTML.
	 */
	get() {
		let beginningOfLine = this.offset;
		// main loop! `i` tracks our position
		// Note that we skip around a lot; `i` is mutated inside the loop
		// pretty often.
		for (let i = beginningOfLine; i < this.str.length; i++) {
			const char = this.at(i);
			switch (char) {
			case '_':
			case '*':
			case '~':
			case '^':
			case '\\':
			case '|':
				// Must be exactly two chars long.
				if (this.at(i + 1) === char && this.at(i + 2) !== char) {
					// This is a completely normal two-char span. Close it if it's
					// already open, open it if it's not.
					// The inside of regular spans must not start or end with a space.
					if (!(this.at(i - 1) !== ' ' && this.closeSpan(char, i, i + 2))) {
						if (this.at(i + 2) !== ' ') this.pushSpan(char, i, i + 2);
					}
					if (i < this.offset) {
						i = this.offset - 1;
						break;
					}
				}
				// skip the rest of a run of the same symbol
				while (this.at(i + 1) === char) i++;
				break;
			case '(':
				// `(` span - does nothing except end spans
				this.stack.push(['(', -1]);
				break;
			case ')':
				// end of `(` span
				this.closeParenSpan(i);
				if (i < this.offset) {
					i = this.offset - 1;
					break;
				}
				break;
			case '`':
				// ` ``code`` ` span. Uses lookahead because its contents are not
				// formatted.
				// Must be at least two `` ` `` in a row.
				if (this.at(i + 1) === '`') this.runLookahead('`', i);
				if (i < this.offset) {
					i = this.offset - 1;
					break;
				}
				while (this.at(i + 1) === '`') i++;
				break;
			case '[':
				// `[` (link) span. Uses lookahead because it might contain a
				// URL which can't be formatted, or search terms that can't be
				// formatted.
				this.runLookahead('[', i);
				if (i < this.offset) {
					i = this.offset - 1;
					break;
				}
				while (this.at(i + 1) === '[') i++;
				break;
			case ':':
				// Looks behind for `spoiler:` or `spoilers:`. Spoiler spans
				// are also weird because they don't require an ending symbol,
				// although that's not handled here.
				if (i < 7) break;
				if (this.slice(i - 7, i + 1).toLowerCase() === 'spoiler:' ||
					this.slice(i - 8, i + 1).toLowerCase() === 'spoilers:') {
					if (this.at(i + 1) === ' ') i++;
					this.pushSpan('spoiler', i + 1, i + 1);
				}
				break;
			case '&': // escaped '<' or '>'
				// greentext or roomid
				if (i === beginningOfLine && this.slice(i, i + 4) === '&gt;') {
					// greentext span, normal except it lacks an ending span
					// check for certain emoticons like `>_>` or `>w<`
					if (!"._/=:;".includes(this.at(i + 4)) && !['w&lt;', 'w&gt;'].includes(this.slice(i + 4, i + 9))) {
						this.pushSpan('>', i, i);
					}
				} else {
					// completely normal `<<roomid>>` span
					// uses lookahead because roomids can't be formatted.
					this.runLookahead('<', i);
				}
				if (i < this.offset) {
					i = this.offset - 1;
					break;
				}
				// skip the rest of a run of escaped `<`
				while (this.slice(i + 1, i + 5) === 'lt;&') i += 4;
				break;
			case '<': // guaranteed to be <a ...> or <u>
				// URL span
				// The constructor has already converted `<` to `&lt;` and URLs
				// to links, so `<` must be the start of a converted link.
				this.runLookahead('a', i);
				if (i < this.offset) {
					i = this.offset - 1;
					break;
				}
				// should never happen
				break;
			case '\r':
			case '\n':
				// End of the line. No spans span multiple lines.
				this.popAllSpans(i);
				if (this.replaceLinebreaks) {
					this.buffers.push(`<br />`);
					// skip the raw linebreak character itself
					this.offset++;
				}
				beginningOfLine = i + 1;
				break;
			}
		}

		this.popAllSpans(this.str.length);
		return this.buffers.join('');
	}
}
/**
 * Converts a chat message to HTML, replacing standard chat formatting
 * with the corresponding HTML tags.
 *
 * @param str the raw message text
 * @param isTrusted forwarded to `TextFormatter`
 * @param replaceLinebreaks forwarded to `TextFormatter`
 * @param showSyntax forwarded to `TextFormatter`
 * @returns the result of `TextFormatter#get`
 */
export function formatText(str: string, isTrusted = false, replaceLinebreaks = false, showSyntax = false) {
	const formatter = new TextFormatter(str, isTrusted, replaceLinebreaks, showSyntax);
	return formatter.get();
}
/**
 * Takes a string and strips all standard chat formatting except greentext from it, the text of a link is kept.
 *
 * Only spans whose contents contain no whitespace are stripped. Operates on
 * the raw (unescaped) message text.
 *
 * @param str the raw message text
 * @returns the text with `**`, `__`, `~~`, ` `` `, `^^` and `\\` span markers
 * removed and `[[...]]` links reduced to their text
 */
export function stripFormatting(str: string) {
	// Doesn't match > meme arrows because the angle bracket appears in the chat still.
	// Every delimiter is doubled - including the subscript delimiter `\\` - so a
	// lone backslash (e.g. in `C:\dir\file`) is left untouched.
	str = str.replace(
		/\*\*([^\s*]+)\*\*|__([^\s_]+)__|~~([^\s~]+)~~|``([^\s`]+)``|\^\^([^\s^]+)\^\^|\\\\([^\s\\]+)\\\\/g,
		(match, $1, $2, $3, $4, $5, $6) => $1 || $2 || $3 || $4 || $5 || $6
	);
	// Remove all of the link except for the text in [[text <url>]]
	return str.replace(/\[\[(?:([^<]*)\s*<[^>]+>|([^\]]+))\]\]/g, (match, $1, $2) => $1 || $2 || '');
}