mtcute/packages/markdown-parser/src/index.ts

import Long from 'long'

import type { InputText, MessageEntity, TextWithEntities, tl } from '@mtcute/core'

// Matches a user mention link: `tg://user?id=<digits>`, optionally followed by `&hash=<hex>`.
// Group 1 is the user id, group 2 (when present) is the hexadecimal access hash.
const MENTION_REGEX = /^tg:\/\/user\?id=(\d+)(?:&hash=(-?[0-9a-fA-F]+)(?:&|$)|&|$)/
// Matches a custom emoji link: `tg://emoji?id=<integer>`. Group 1 is the document id.
const EMOJI_REGEX = /^tg:\/\/emoji\?id=(-?\d+)/

// Markdown tags emitted by `unparse` around the corresponding entity types
const TAG_BOLD = '**'
const TAG_ITALIC = '__'
const TAG_UNDERLINE = '--'
const TAG_STRIKE = '~~'
const TAG_SPOILER = '||'
const TAG_CODE = '`'
const TAG_PRE = '```'

// Characters that get a backslash prepended when escaping: * _ - ~ ` [ \ ] |
// (global flag, so that `String.prototype.replace` processes every occurrence)
const TO_BE_ESCAPED = /[*_\-~`[\\\]|]/g

/**
 * Make a string safe to embed in Markdown by putting a backslash
 * in front of every character matched by `TO_BE_ESCAPED`.
 *
 * > **Note**: this function is in most cases not needed, as `md` function
 * > handles all `string`s passed to it automatically as plain text.
 *
 * @param str  Plain text to escape
 * @returns  The same text with every reserved character backslash-prefixed
 */
function escape(str: string): string {
    // `$&` in a replacement pattern stands for the matched substring,
    // so each match becomes "\" followed by itself
    const backslashed = '\\$&'

    return str.replace(TO_BE_ESCAPED, backslashed)
}

/**
 * Add Markdown formatting to the text given the plain text and entities contained in it.
 *
 * A plain `string` input is simply escaped. For a text with entities, every
 * reserved character is backslash-escaped first, and then the opening/closing
 * tags of each supported entity are spliced into the escaped text.
 * Entities of types not listed in the `switch` below are skipped.
 *
 * @param input  Plain string, or an object with `text` and (optional) `entities`
 * @returns  Markdown representation of the input
 */
function unparse(input: InputText): string {
    if (typeof input === 'string') return escape(input)

    const source = input.text
    const entities = input.entities ?? []

    // positions (in the *original* text, ascending) of the characters
    // that got an escaping backslash inserted in front of them
    const escaped: number[] = []
    let text = source.replace(TO_BE_ESCAPED, (s, pos: number) => {
        escaped.push(pos)

        return '\\' + s
    })
    const hasEscaped = escaped.length > 0

    type InsertLater = [number, string]
    const insert: InsertLater[] = []

    for (const entity of entities) {
        const type = entity._

        let start = entity.offset
        let end = start + entity.length

        // entity offsets refer to the original (unescaped) text,
        // so they are validated/clamped against its length
        if (start > source.length) continue
        if (start < 0) start = 0
        if (end > source.length) end = source.length
        if (end < start) end = start

        if (hasEscaped) {
            // translate offsets in the original text to offsets in the escaped text
            // by adding the number of backslashes inserted before each position
            let escapedPos = 0

            // the backslash of a character sitting exactly at `start` belongs
            // inside the entity, hence it is not counted here
            while (escapedPos < escaped.length && escaped[escapedPos] < start) {
                escapedPos += 1
            }
            start += escapedPos

            // likewise, the backslash of the character sitting exactly at `end`
            // (the first one *after* the entity) must stay outside of it,
            // otherwise the closing tag would end up between `\` and that character
            while (escapedPos < escaped.length && escaped[escapedPos] < end) {
                escapedPos += 1
            }
            end += escapedPos
        }

        let startTag: string
        let endTag: string

        switch (type) {
            case 'messageEntityBold':
                startTag = endTag = TAG_BOLD
                break
            case 'messageEntityItalic':
                startTag = endTag = TAG_ITALIC
                break
            case 'messageEntityUnderline':
                startTag = endTag = TAG_UNDERLINE
                break
            case 'messageEntityStrike':
                startTag = endTag = TAG_STRIKE
                break
            case 'messageEntitySpoiler':
                startTag = endTag = TAG_SPOILER
                break
            case 'messageEntityCode':
                startTag = endTag = TAG_CODE
                break
            case 'messageEntityPre':
                startTag = TAG_PRE

                if (entity.language) {
                    startTag += entity.language
                }

                startTag += '\n'
                endTag = '\n' + TAG_PRE
                break
            case 'messageEntityTextUrl':
                startTag = '['
                endTag = `](${entity.url})`
                break
            case 'messageEntityMentionName':
                startTag = '['
                endTag = `](tg://user?id=${entity.userId})`
                break
            case 'messageEntityCustomEmoji':
                startTag = '['
                endTag = `](tg://emoji?id=${entity.documentId.toString()})`
                break
            default:
                // entity type has no Markdown counterpart here
                continue
        }

        insert.push([start, startTag])
        insert.push([end, endTag])
    }

    // sort by offset desc, so that inserting a tag never shifts
    // the offsets of the tags that are yet to be inserted
    insert.sort((a, b) => b[0] - a[0])

    for (const [offset, tag] of insert) {
        text = text.slice(0, offset) + tag + text.slice(offset)
    }

    return text
}

/**
 * Parse Markdown-formatted text into plain text and message entities.
 *
 * Can be used either as a template tag or called with a single plain string.
 *
 * Recognized syntax:
 *  - `**bold**`, `__italic__`, `--underline--`, `~~strike~~`, `||spoiler||`
 *  - `` `code` `` and triple-backtick blocks (text between the opening backticks
 *    and the first line feed is used as the language)
 *  - `[text](url)` links; `tg://user?id=...` URLs become mentions,
 *    `tg://emoji?id=...` URLs become custom emojis, URLs starting with `//` get `http:` prepended
 *  - `\x` inserts `x` literally
 *
 * Outside of code/pre, a line feed also swallows the whitespace that follows it;
 * a line feed at the very start of a chunk is dropped, and if only whitespace
 * remains after a line feed, trailing whitespace of the result is trimmed.
 *
 * Substitutions (template tag usage): booleans, `null` and `undefined` are skipped,
 * strings and numbers are appended verbatim (they are *not* parsed as Markdown),
 * and texts with entities / message entities are appended with their entities
 * shifted to the position they were inserted at.
 *
 * @param strings  Template strings array, or a plain string to parse
 * @param sub  Values substituted between the template strings
 * @returns  Resulting plain text and the entities found in it
 * @throws Error  If an entity is left unterminated, a link is missing its closing `)`,
 *   or an opening triple backtick is not followed by a line feed
 */
function parse(
    strings: TemplateStringsArray | string,
    ...sub: (InputText | MessageEntity | boolean | number | undefined | null)[]
): TextWithEntities {
    const entities: tl.TypeMessageEntity[] = []
    let result = ''

    // entities that were opened but not closed yet, grouped by kind
    const stacks: Record<string, tl.Mutable<tl.TypeMessageEntity>[]> = {}

    // parser state, shared between consecutive `feed` calls
    let insideCode = false
    let insidePre = false
    let insideLink = false

    // consume one chunk of Markdown, appending to `result` and `entities`
    function feed(text: string) {
        const len = text.length
        let pos = 0

        while (pos < len) {
            const c = text[pos]

            if (c === '\\') {
                // a backslash makes the following character literal.
                // when it is the last character of the chunk there is nothing to escape,
                // so the backslash itself is kept (instead of appending "undefined")
                result += pos + 1 < len ? text[pos + 1] : c
                pos += 2
                continue
            }

            if (insideCode) {
                if (c === '`') {
                    // we can be certain that we're inside code

                    const ent = stacks.code.pop()!
                    ent.length = result.length - ent.offset
                    entities.push(ent)
                    insideCode = false
                    pos += 1
                } else {
                    pos += 1
                    result += c
                }
                continue
            }

            if (insidePre) {
                if (c === '`' || (c === '\n' && text[pos + 1] === '`')) {
                    // the line feed right before the closing ``` is not a part of the block
                    if (c === '\n') pos += 1

                    if (text[pos + 1] === '`' && text[pos + 2] === '`') {
                        // we can be certain that we're inside pre

                        const ent = stacks.pre.pop()!
                        ent.length = result.length - ent.offset
                        entities.push(ent)
                        insidePre = false
                        pos += 3
                        continue

                        // closed with single or double backtick
                        // i.e. not closed actually! this is totally valid md:
                        // ```javascript
                        // const a = ``;
                        // ```
                        // compensate that `pos` change we made earlier
                    } else if (c === '\n') {
                        pos -= 1
                    }
                }

                pos += 1
                result += c
                continue
            }

            if (insideLink && c === ']') {
                // we can be certain that we're inside link

                const ent = stacks.link.pop()!

                if (text[pos + 1] !== '(') {
                    // [link text]
                    // ignore this, and add opening [
                    // NOTE(review): entities recorded while inside the brackets keep
                    // their offsets even though `[` is inserted in front of them
                    result = `${result.slice(0, ent.offset)}[${result.slice(ent.offset)}]`
                    pos += 1
                    insideLink = false
                    continue
                }

                // skip `](` and read the URL up to the closing parenthesis
                pos += 2
                let url = ''

                while (pos < text.length && text[pos] !== ')') {
                    url += text[pos++]
                }

                pos += 1 // )

                if (pos > text.length) {
                    throw new Error('Malformed LINK entity, expected )')
                }

                // links with an empty URL produce no entity, only their text is kept
                if (url.length) {
                    ent.length = result.length - ent.offset

                    let m = url.match(MENTION_REGEX)

                    if (m) {
                        const userId = parseInt(m[1], 10)
                        const accessHash = m[2]

                        if (accessHash) {
                            (ent as tl.Mutable<tl.RawInputMessageEntityMentionName>)._ =
                                'inputMessageEntityMentionName'
                            ;(ent as tl.Mutable<tl.RawInputMessageEntityMentionName>).userId = {
                                _: 'inputUser',
                                userId,
                                accessHash: Long.fromString(accessHash, false, 16),
                            }
                        } else {
                            (ent as tl.Mutable<tl.RawMessageEntityMentionName>)._ = 'messageEntityMentionName'
                            ;(ent as tl.Mutable<tl.RawMessageEntityMentionName>).userId = userId
                        }
                    } else if ((m = EMOJI_REGEX.exec(url))) {
                        (ent as tl.Mutable<tl.RawMessageEntityCustomEmoji>)._ = 'messageEntityCustomEmoji'
                        ;(ent as tl.Mutable<tl.RawMessageEntityCustomEmoji>).documentId = Long.fromString(m[1])
                    } else {
                        if (url.match(/^\/\//)) url = 'http:' + url
                        ;(ent as tl.Mutable<tl.RawMessageEntityTextUrl>)._ = 'messageEntityTextUrl'
                        ;(ent as tl.Mutable<tl.RawMessageEntityTextUrl>).url = url
                    }
                    entities.push(ent)
                }

                insideLink = false
                continue
            }

            if (c === '[' && !insideLink) {
                pos += 1
                insideLink = true
                if (!('link' in stacks)) stacks.link = []
                // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
                stacks.link.push({
                    offset: result.length,
                    length: 0,
                    // eslint-disable-next-line @typescript-eslint/no-explicit-any
                } as any) // other fields are added after the second part
                continue
            }

            if (c === '`') {
                const isPre = text[pos + 1] === '`' && text[pos + 2] === '`'

                if (isPre) {
                    pos += 3
                    let language = ''

                    // everything up to the line feed is the language name
                    while (pos < text.length && text[pos] !== '\n') {
                        language += text[pos++]
                    }

                    // newline
                    pos += 1

                    if (pos > text.length) {
                        throw new Error('Malformed PRE entity, expected LF after ```')
                    }

                    if (!('pre' in stacks)) stacks.pre = []
                    stacks.pre.push({
                        _: 'messageEntityPre',
                        offset: result.length,
                        length: 0,
                        language,
                    })
                    insidePre = true
                } else {
                    pos += 1
                    if (!('code' in stacks)) stacks.code = []
                    stacks.code.push({
                        _: 'messageEntityCode',
                        offset: result.length,
                        length: 0,
                    })
                    insideCode = true
                }

                continue
            }

            if (c === text[pos + 1]) {
                // maybe (?) start or end of an entity
                let type: 'Italic' | 'Bold' | 'Underline' | 'Strike' | 'Spoiler' | null = null

                switch (c) {
                    case '_':
                        type = 'Italic'
                        break
                    case '*':
                        type = 'Bold'
                        break
                    case '-':
                        type = 'Underline'
                        break
                    case '~':
                        type = 'Strike'
                        break
                    case '|':
                        type = 'Spoiler'
                        break
                }

                if (type) {
                    if (!(type in stacks)) stacks[type] = []
                    // the same tag both opens and closes an entity,
                    // so it is an opening one only when nothing of this kind is open
                    const isBegin = stacks[type].length === 0

                    if (isBegin) {
                        stacks[type].push({
                            _: `messageEntity${type}`,
                            offset: result.length,
                            length: 0,
                        })
                    } else {
                        // valid because isBegin is false

                        const ent = stacks[type].pop()!
                        ent.length = result.length - ent.offset
                        entities.push(ent)
                    }

                    pos += 2
                    continue
                }
            }

            if (c === '\n') {
                // a line feed at the very start of a chunk is dropped
                if (pos !== 0) {
                    result += '\n'
                }

                // skip the whitespace following the line feed (e.g. indentation of a template literal)
                const nonWhitespace = text.slice(pos + 1).search(/\S/)

                if (nonWhitespace !== -1) {
                    pos += nonWhitespace + 1
                } else {
                    // nothing but whitespace is left in this chunk
                    pos = len
                    result = result.trimEnd()
                }
                continue
            }

            // nothing matched => normal character
            result += c
            pos += 1
        }
    }

    if (typeof strings === 'string') strings = [strings] as unknown as TemplateStringsArray

    sub.forEach((it, idx) => {
        feed(strings[idx])

        // booleans and nullish values are skipped to allow `${cond && value}`.
        // numbers are never skipped, so that `0` is rendered as "0"
        if (typeof it === 'boolean' || it === null || it === undefined) return

        if (typeof it === 'string' || typeof it === 'number') {
            // appended verbatim, i.e. treated as plain text
            result += it
        } else {
            // TextWithEntities or MessageEntity
            const text = it.text
            const innerEntities = 'raw' in it ? [it.raw] : it.entities

            const baseOffset = result.length
            result += text

            if (innerEntities) {
                for (const ent of innerEntities) {
                    // copy the entity, moving it to where the text was inserted
                    entities.push({ ...ent, offset: ent.offset + baseOffset })
                }
            }
        }
    })

    feed(strings[strings.length - 1])

    // whatever is still on the stacks was opened but never closed
    for (const [name, stack] of Object.entries(stacks)) {
        if (stack.length) {
            throw new Error(`Unterminated ${name} entity`)
        }
    }

    return {
        text: result,
        entities,
    }
}

// typedoc doesn't support this yet, so we'll have to do it manually
// https://github.com/TypeStrong/typedoc/issues/2436

export const md: {
    /**
     * Tagged template based Markdown-to-entities parser function
     *
     * Additionally, `md` function has two static methods:
     * - `md.escape` - escape a string to be safely used in Markdown
     *   (should not be needed in most cases, as `md` function itself handles all `string`s
     *   passed to it automatically as plain text)
     * - `md.unparse` - add Markdown formatting to the text given the plain text and entities contained in it
     *
     * @example
     * ```typescript
     * const text = md`**${user.displayName}**`
     * ```
     */
    (
        strings: TemplateStringsArray,
        ...sub: (InputText | MessageEntity | boolean | number | undefined | null)[]
    ): TextWithEntities
    /**
     * A variant taking a plain JS string as input
     * and parsing it.
     *
     * Useful for cases when you already have a string
     * (e.g. from some server) and want to parse it.
     *
     * @example
     * ```typescript
     * const string = '**hello**'
     * const text = md(string)
     * ```
     */
    (string: string): TextWithEntities
    /** Escape a string to be safely used in Markdown */
    escape: typeof escape
    /** Add Markdown formatting to the text given the plain text and entities contained in it */
    unparse: typeof unparse
    // `Object.assign` attaches the helpers as properties of the `parse` function itself,
    // so the exported value is callable and also exposes `escape` / `unparse`
} = Object.assign(parse, {
    escape,
    unparse,
})