mtcute/packages/html-parser/src/index.ts

439 lines
14 KiB
TypeScript
Raw Normal View History

import { Parser } from 'htmlparser2'
import Long from 'long'
import type {
FormattedString,
IMessageEntityParser,
MessageEntity,
tl,
2021-08-05 20:38:24 +03:00
} from '@mtcute/client'
2021-04-08 12:19:38 +03:00
// Matches `tg://user?id=<digits>` links, optionally followed by `&hash=<hex>`.
// Capture group 1 is the numeric user id; capture group 2 (when present) is the
// access hash, which the parser below reads as a base-16 number.
// The id (and the hash, if any) must be followed by `&` or the end of the string.
const MENTION_REGEX =
/^tg:\/\/user\?id=(\d+)(?:&hash=(-?[0-9a-fA-F]+)(?:&|$)|&|$)/
2021-04-08 12:19:38 +03:00
/**
 * Tagged template based helper for escaping entities in HTML
 *
 * Substitutions are handled as follows:
 *  - plain strings are HTML-escaped; when the substitution directly follows
 *    `="` or `='` (i.e. it is an attribute value), the matching quote
 *    character is escaped as well
 *  - `FormattedString`s are inserted as-is (they are assumed to already be HTML)
 *  - booleans and falsy values (`null`, `undefined`, `''`) produce no output,
 *    which allows `${cond && 'text'}`
 *
 * @throws Error  If a `FormattedString` with a parse mode other than `html` is passed
 * @example
 * ```typescript
 * const escaped = html`<b>${user.displayName}</b>`
 * ```
 */
export function html(
    strings: TemplateStringsArray,
    ...sub: (string | FormattedString<'html'> | boolean | undefined | null)[]
): FormattedString<'html'> {
    let str = ''

    for (let idx = 0; idx < sub.length; idx++) {
        // the literal chunk preceding a substitution must always be emitted,
        // even if the substitution itself ends up being omitted
        str += strings[idx]

        const it = sub[idx]
        if (typeof it === 'boolean' || !it) continue

        if (typeof it === 'string') {
            // checked *after* appending the literal chunk, so that
            // `href="${url}"` is correctly detected as an attribute value
            const attrQuote = /=(['"])$/.exec(str)
            let escaped = HtmlMessageEntityParser.escape(it, attrQuote !== null)

            if (attrQuote !== null && attrQuote[1] === "'") {
                // `escape` only takes care of double quotes
                escaped = escaped.replace(/'/g, '&#39;')
            }

            str += escaped
            continue
        }

        if (it.mode && it.mode !== 'html') {
            throw new Error(`Incompatible parse mode: ${it.mode}`)
        }

        str += it.value
    }

    return { value: str + strings[strings.length - 1], mode: 'html' }
}
/**
 * Syntax highlighter function used in {@link HtmlMessageEntityParser.unparse}
 *
 * It is only invoked for `code`/`pre` entities that have a language set,
 * and whatever it returns is placed between the opening and closing tags
 * as-is (it is not escaped afterwards).
 *
 * Must be sync (this might change in the future) and must return valid HTML.
 *
 * @param code  Text of the entity to be highlighted
 * @param language  Language of the entity
 * @returns  HTML markup to be used as the entity contents
 */
export type SyntaxHighlighter = (code: string, language: string) => string
2021-04-08 12:19:38 +03:00
/**
 * Options accepted by the {@link HtmlMessageEntityParser} constructor
 */
export interface HtmlMessageEntityParserOptions {
    /**
     * Optional syntax highlighter. When provided, it is used while
     * un-parsing `code`/`pre` entities that have a language set.
     */
    syntaxHighlighter?: SyntaxHighlighter
}
/**
 * HTML MessageEntity parser.
 *
 * This class implements syntax very similar to one available
 * in the Bot API ([documented here](https://core.telegram.org/bots/api#html-style))
 * with some slight differences.
 */
export class HtmlMessageEntityParser implements IMessageEntityParser {
    name = 'html'

    private readonly _syntaxHighlighter?: SyntaxHighlighter

    /**
     * @param options  Parser options (currently only the optional syntax highlighter)
     */
    constructor(options?: HtmlMessageEntityParserOptions) {
        this._syntaxHighlighter = options?.syntaxHighlighter
    }

    /**
     * Escape the string so it can be safely used inside HTML
     *
     * @param str  String to be escaped
     * @param quote  Whether `"` (double quote) should be escaped as `&quot;`
     */
    static escape(str: string, quote = false): string {
        str = str
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
        if (quote) str = str.replace(/"/g, '&quot;')

        return str
    }

    /**
     * Escape a piece of plain (non-`pre`) text for un-parsing.
     *
     * On top of {@link escape}, newlines become `<br>` and runs of
     * two or more spaces become `&nbsp;`s, since {@link parse}
     * collapses whitespace outside of `<pre>`.
     */
    private static _escapeText(text: string): string {
        return HtmlMessageEntityParser.escape(text)
            .replace(/\n/g, '<br>')
            .replace(/ {2,}/g, (match) => '&nbsp;'.repeat(match.length))
    }

    /**
     * Parse HTML markup into plain text and a list of message entities.
     *
     * Outside of `<pre>`, runs of whitespace (except non-breaking spaces) are
     * collapsed into a single space and trimmed around tag boundaries;
     * `<br>` produces a newline. Tags inside `<pre>` are ignored,
     * and so are unknown tags (their text content is kept).
     *
     * @param text  HTML markup to parse
     * @returns  Tuple of the resulting plain text and the entities found in it
     */
    parse(text: string): [string, tl.TypeMessageEntity[]] {
        // currently open entities, per tag name. This is a Map and not a plain
        // object so that tag names like `constructor` can't collide with
        // properties inherited from Object.prototype
        const stacks = new Map<string, tl.Mutable<tl.TypeMessageEntity>[]>()
        const entities: tl.TypeMessageEntity[] = []
        let plainText = ''
        let pendingText = ''

        const insidePre = (): boolean => Boolean(stacks.get('pre')?.length)

        // every entity that is still open covers the text that was just appended
        function extendOpenEntities(by: number): void {
            for (const ents of stacks.values()) {
                for (const ent of ents) {
                    ent.length += by
                }
            }
        }

        // flush the text accumulated since the last tag into `plainText`
        function processPendingText(tagEnd = false): void {
            if (!pendingText.length) return

            if (!insidePre()) {
                pendingText = pendingText.replace(/[^\S\u00A0]+/gs, ' ')
                if (tagEnd) pendingText = pendingText.trimEnd()

                if (!plainText.length || /\s$/.test(plainText)) {
                    pendingText = pendingText.trimStart()
                }
            }

            extendOpenEntities(pendingText.length)
            plainText += pendingText
            pendingText = ''
        }

        const parser = new Parser({
            onopentag(name, attribs) {
                name = name.toLowerCase()

                processPendingText()

                // ignore tags inside pre (except pre)
                if (name !== 'pre' && insidePre()) return

                let entity: tl.TypeMessageEntity

                switch (name) {
                    case 'br':
                        plainText += '\n'
                        // the newline is a part of every entity that is currently open
                        extendOpenEntities(1)

                        return
                    case 'b':
                    case 'strong':
                        entity = {
                            _: 'messageEntityBold',
                            offset: plainText.length,
                            length: 0,
                        }
                        break
                    case 'i':
                    case 'em':
                        entity = {
                            _: 'messageEntityItalic',
                            offset: plainText.length,
                            length: 0,
                        }
                        break
                    case 'u':
                        entity = {
                            _: 'messageEntityUnderline',
                            offset: plainText.length,
                            length: 0,
                        }
                        break
                    case 's':
                    case 'del':
                    case 'strike':
                        entity = {
                            _: 'messageEntityStrike',
                            offset: plainText.length,
                            length: 0,
                        }
                        break
                    case 'blockquote':
                        entity = {
                            _: 'messageEntityBlockquote',
                            offset: plainText.length,
                            length: 0,
                        }
                        break
                    case 'code':
                        entity = {
                            _: 'messageEntityCode',
                            offset: plainText.length,
                            length: 0,
                        }
                        break
                    case 'pre':
                        entity = {
                            _: 'messageEntityPre',
                            offset: plainText.length,
                            length: 0,
                            language: attribs.language ?? '',
                        }
                        break
                    case 'spoiler':
                    case 'tg-spoiler':
                        entity = {
                            _: 'messageEntitySpoiler',
                            offset: plainText.length,
                            length: 0,
                        }
                        break
                    case 'emoji':
                    case 'tg-emoji': {
                        const id = attribs.id || attribs['emoji-id']
                        if (!id || !/^-?\d+$/.test(id)) return

                        entity = {
                            _: 'messageEntityCustomEmoji',
                            offset: plainText.length,
                            length: 0,
                            documentId: Long.fromString(id),
                        }
                        break
                    }
                    case 'a': {
                        let url = attribs.href
                        if (!url) return

                        const mention = MENTION_REGEX.exec(url)

                        if (mention) {
                            const id = parseInt(mention[1], 10)
                            const accessHash = mention[2]

                            if (accessHash) {
                                // the hash is written in hex in the link
                                entity = {
                                    _: 'inputMessageEntityMentionName',
                                    offset: plainText.length,
                                    length: 0,
                                    userId: {
                                        _: 'inputUser',
                                        userId: id,
                                        accessHash: Long.fromString(
                                            accessHash,
                                            false,
                                            16,
                                        ),
                                    },
                                }
                            } else {
                                entity = {
                                    _: 'messageEntityMentionName',
                                    offset: plainText.length,
                                    length: 0,
                                    userId: id,
                                }
                            }
                        } else {
                            // protocol-relative links
                            if (url.match(/^\/\//)) url = 'http:' + url

                            entity = {
                                _: 'messageEntityTextUrl',
                                offset: plainText.length,
                                length: 0,
                                url,
                            }
                        }
                        break
                    }
                    default:
                        return
                }

                let stack = stacks.get(name)

                if (!stack) {
                    stack = []
                    stacks.set(name, stack)
                }

                stack.push(entity)
            },
            onclosetag(name: string) {
                processPendingText(true)

                name = name.toLowerCase()

                // ignore tags inside pre (except pre)
                if (name !== 'pre' && insidePre()) return

                const entity = stacks.get(name)?.pop()
                if (!entity) return // unmatched close tag

                // ignore nested pre-s
                if (name !== 'pre' || !insidePre()) {
                    entities.push(entity)
                }
            },
            ontext(data) {
                pendingText += data
            },
        })

        parser.write(text)

        processPendingText(true)

        return [plainText.replace(/\u00A0/g, ' '), entities]
    }

    /**
     * Convert plain text and its entities back into HTML markup.
     *
     * @param text  Plain text
     * @param entities  Entities of the text
     * @returns  HTML markup
     */
    unparse(text: string, entities: ReadonlyArray<MessageEntity>): string {
        return this._unparse(text, entities)
    }

    /**
     * Internal function that uses recursion to correctly
     * process nested & overlapping entities
     *
     * @param text  Text of the range that is being processed
     * @param entities  All entities of the message
     * @param entitiesOffset  Index of the first entity to consider
     * @param offset  Offset of `text` inside the original message text
     * @param length  Length of the range that is being processed
     */
    private _unparse(
        text: string,
        entities: ReadonlyArray<MessageEntity>,
        entitiesOffset = 0,
        offset = 0,
        length = text.length,
    ): string {
        if (!text) return text

        if (!entities.length || entities.length === entitiesOffset) {
            return HtmlMessageEntityParser._escapeText(text)
        }

        const end = offset + length

        const parts: string[] = []
        // offset (relative to `text`) up to which the output was already produced
        let lastOffset = 0

        for (let i = entitiesOffset; i < entities.length; i++) {
            const entity = entities[i]
            if (entity.offset >= end) break

            let entOffset = entity.offset
            let entLength = entity.length

            if (entOffset < 0) {
                entLength += entOffset
                entOffset = 0
            }

            let relativeOffset = entOffset - offset

            if (relativeOffset > lastOffset) {
                // add missing plain text
                parts.push(
                    HtmlMessageEntityParser._escapeText(
                        text.substring(lastOffset, relativeOffset),
                    ),
                )
                // remember that this text was emitted, otherwise it would be
                // emitted once more if the entity ends up being skipped below
                lastOffset = relativeOffset
            } else if (relativeOffset < lastOffset) {
                // the entity starts inside a part that was already emitted
                // (overlapping entities), only its remainder is processed
                entLength -= lastOffset - relativeOffset
                relativeOffset = lastOffset
            }

            if (entLength <= 0 || relativeOffset >= end || relativeOffset < 0) {
                continue
            }

            const substr = text.substring(
                relativeOffset,
                relativeOffset + entLength,
            )
            if (!substr) continue

            const type = entity.type
            let skip = false

            let entityText: string

            if (type === 'pre') {
                // nested entities are not rendered inside pre
                entityText = substr
            } else {
                entityText = this._unparse(
                    substr,
                    entities,
                    i + 1,
                    offset + relativeOffset,
                    entLength,
                )
            }

            switch (type) {
                case 'bold':
                case 'italic':
                case 'underline':
                case 'strikethrough':
                    parts.push(`<${type[0]}>${entityText}</${type[0]}>`)
                    break
                case 'code':
                case 'pre': {
                    const language = entity.language
                    const langAttr = language ?
                        ` language="${HtmlMessageEntityParser.escape(
                            language,
                            true,
                        )}"` :
                        ''

                    let inner: string

                    if (this._syntaxHighlighter && language) {
                        // for `pre` the highlighter receives the raw text
                        inner = this._syntaxHighlighter(entityText, language)
                    } else if (type === 'pre') {
                        // raw text was not escaped yet. newlines and spaces
                        // are kept as-is, since the parser keeps whitespace in pre
                        inner = HtmlMessageEntityParser.escape(entityText)
                    } else {
                        inner = entityText
                    }

                    parts.push(`<${type}${langAttr}>${inner}</${type}>`)
                    break
                }
                case 'blockquote':
                case 'spoiler':
                    parts.push(`<${type}>${entityText}</${type}>`)
                    break
                case 'email':
                    // href is built from the raw text, not from the rendered HTML
                    parts.push(
                        `<a href="mailto:${HtmlMessageEntityParser.escape(
                            substr,
                            true,
                        )}">${entityText}</a>`,
                    )
                    break
                case 'url':
                    parts.push(
                        `<a href="${HtmlMessageEntityParser.escape(
                            substr,
                            true,
                        )}">${entityText}</a>`,
                    )
                    break
                case 'text_link':
                    parts.push(
                        `<a href="${HtmlMessageEntityParser.escape(
                            // todo improve typings
                            entity.url!,
                            true,
                        )}">${entityText}</a>`,
                    )
                    break
                case 'text_mention':
                    parts.push(
                        // todo improve typings
                        `<a href="tg://user?id=${entity.userId!}">${entityText}</a>`,
                    )
                    break
                default:
                    // unsupported entity, its text is emitted as plain text later
                    skip = true
                    break
            }

            lastOffset = relativeOffset + (skip ? 0 : entLength)
        }

        parts.push(
            HtmlMessageEntityParser._escapeText(text.substring(lastOffset)),
        )

        return parts.join('')
    }
}