feat(html): big rework, process html similar to browsers

2022-05-06 00:05:21 +03:00 · 2022-05-06 00:05:21 +03:00 · 28baf50958
commit 28baf50958
parent d031388ea2
3 changed files with 223 additions and 76 deletions
--- a/packages/html-parser/README.md
+++ b/packages/html-parser/README.md
@ -5,7 +5,7 @@
 This package implements formatting syntax based on HTML, similar to the one available in the Bot
 API ([documented here](https://core.telegram.org/bots/api#html-style))
-> **NOTE**: The syntax implemented here is not entirely compatible with Bot API _HTML_.
+> **NOTE**: The syntax implemented here is **incompatible** with Bot API _HTML_.
 >
 > Please read [Syntax](#syntax) below for a detailed explanation
@ -20,7 +20,7 @@ tg.registerParseMode(new HtmlMessageEntityParser())
 tg.sendText(
    'me',
-    html`Hello, <b>me</b>! Updates from the feed:\n${await getUpdatesFromFeed()}`
+    html`Hello, <b>me</b>! Updates from the feed:<br>${await getUpdatesFromFeed()}`
 )
 ```
@ -30,34 +30,26 @@ tg.sendText(
 supports nearly any HTML. However, since the text is still processed in a custom way for Telegram, the supported subset
 of features is documented below:
-## Line breaks
+## Line breaks and spaces
-Line breaks are preserved, `<br>` are ignored.
+Line breaks are **not** preserved, `<br>` is used instead,
 making the syntax very close to the one used when building web pages.
-> ⚠️ Warning for **Prettier** users: be aware that Prettier
+Multiple spaces and indents are collapsed; when you do need multiple spaces, use `&nbsp;` instead.
 > formats tagged template literals with `html` as normal HTML and may add
 > unwanted line breaks.
 >
 > Use `htm` instead (which is just an alias):
 > ```typescript
 > import { htm } from '@mtcute/html-parser'
 >
 > await msg.answerText(htm`Hello, <b>${msg.sender.username}</b>`)
 > ```
 ## Inline entities
 Inline entities are entities that are in-line with other text. We support these entities:
-| Name | Code | Result (visual)
+| Name             | Code                                      | Result (visual)              |
-|---|---|---|
+|------------------|-------------------------------------------|------------------------------|
-| Bold | `<b>text</b>` | **text**
+| Bold             | `<b>text</b>`                             | **text**                     |
-| Italic | `<b>text</b>` | _text_
+| Italic           | `<i>text</i>`                             | _text_                       |
-| Underline | `<u>text</u>` | <u>text</u>
+| Underline        | `<u>text</u>`                             | <u>text</u>                  |
-| Strikethrough | `<s>text</s>` | ~~text~~
+| Strikethrough    | `<s>text</s>`                             | ~~text~~                     |
-| Monospace (code) | `<code>text</code>` | `text`
+| Monospace (code) | `<code>text</code>`                       | `text`                       |
-| Text link | `<a href="https://google.com">Google</a>` | [Google](https://google.com)
+| Text link        | `<a href="https://google.com">Google</a>` | [Google](https://google.com) |
-| Text mention | `<a href="tg://user?id=1234567">Name</a>` | N/A
+| Text mention     | `<a href="tg://user?id=1234567">Name</a>` | N/A                          |
 > **Note**: `<strong>`, `<em>`, `<ins>`, `<strike>`, `<del>` are not supported because they are redundant
@ -82,10 +74,10 @@ Optionally, language for `<pre>` block can be specified like this:
 > However, since syntax highlighting hasn't been implemented in
 > official Telegram clients, this doesn't really matter 🤷‍♀️
-| Code | Result (visual)
+| Code                                                                                | Result (visual)              |
-|---|---|
+|-------------------------------------------------------------------------------------|------------------------------|
-| <pre>&lt;pre&gt;multiline\ntext&lt;/pre&gt;</pre> | <pre>multiline<br>text</pre>
+| <pre>&lt;pre&gt;multiline\ntext&lt;/pre&gt;</pre>                                   | <pre>multiline<br>text</pre> |
-| <pre>&lt;pre language="javascript"&gt;<br>  export default 42<br>&lt;/pre&gt;</pre> | <pre>export default 42</pre>
+| <pre>&lt;pre language="javascript"&gt;<br>  export default 42<br>&lt;/pre&gt;</pre> | <pre>export default 42</pre> |
 ## Nested and overlapped entities
@ -94,12 +86,11 @@ as expected!
 Overlapping entities are supported in `unparse()`, though.
-| Code | Result (visual)
+| Code                                                                                                                | Result (visual)                                                          |
-|---|---|
+|---------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------|
-| `<b>Welcome back, <i>User</i>!</b>` | **Welcome back, _User_!**
+| `<b>Welcome back, <i>User</i>!</b>`                                                                                 | **Welcome back, _User_!**                                                |
-| `<b>bold <i>and</b> italic</i>` | **bold _and_** italic<br>⚠️ <i>word "italic" is not actually italic!</i>
+| `<b>bold <i>and</b> italic</i>`                                                                                     | **bold _and_** italic<br>⚠️ <i>word "italic" is not actually italic!</i> |
-| `<b>bold <i>and</i></b><i> italic</i>`<br>⚠️ <i>this is how <code>unparse()</code> handles overlapping entities</i> | **
+| `<b>bold <i>and</i></b><i> italic</i>`<br>⚠️ <i>this is how <code>unparse()</code> handles overlapping entities</i> | **bold _and_** _italic_                                                  |
 bold _and_** _italic_
 ## Escaping
--- a/packages/html-parser/src/index.ts
+++ b/packages/html-parser/src/index.ts
@ -7,7 +7,8 @@ import { tl } from '@mtcute/tl'
 import { Parser } from 'htmlparser2'
 import Long from 'long'
-const MENTION_REGEX = /^tg:\/\/user\?id=(\d+)(?:&hash=(-?[0-9a-fA-F]+)(?:&|$)|&|$)/
+const MENTION_REGEX =
    /^tg:\/\/user\?id=(\d+)(?:&hash=(-?[0-9a-fA-F]+)(?:&|$)|&|$)/
 /**
 * Tagged template based helper for escaping entities in HTML
@ -35,28 +36,6 @@ export function html(
    return { value: str + strings[strings.length - 1], mode: 'html' }
 }
 /**
 * Alias for {@link html} for Prettier users.
 *
 * Prettier formats <code>html`...`</code> as normal HTML,
 * thus may add unwanted line breaks.
 */
 // eslint-disable-next-line @typescript-eslint/ban-ts-comment
 // @ts-ignore
 export declare function htm(
    strings: TemplateStringsArray,
    ...sub: (string | FormattedString)[]
 ): FormattedString
 /** @internal */
 // eslint-disable-next-line @typescript-eslint/ban-ts-comment
 // @ts-ignore
 export const htm = html
 // ts ignores above are a hack so the resulting d.ts contains `htm`
 // as a function and not a variable, thus the ide would highlight
 // it as such (the same way as `html`)
 export namespace HtmlMessageEntityParser {
    /**
     * Syntax highlighter function used in {@link HtmlMessageEntityParser.unparse}
@ -106,13 +85,45 @@ export class HtmlMessageEntityParser implements IMessageEntityParser {
        const stacks: Record<string, tl.Mutable<tl.TypeMessageEntity>[]> = {}
        const entities: tl.TypeMessageEntity[] = []
        let plainText = ''
        let pendingText = ''
        /**
         * Flush the text buffered in `pendingText` into `plainText`.
         *
         * Outside of `<pre>`, whitespace is normalized first: every run of
         * whitespace is collapsed into a single space, trailing whitespace
         * is dropped when the text is directly followed by a closing tag,
         * and leading whitespace is dropped when the output is still empty
         * or already ends with whitespace.
         *
         * Non-breaking spaces (U+00A0) are deliberately NOT treated as
         * whitespace by any of these steps. `String.prototype.trimStart()`,
         * `trimEnd()` and the `\s` class all match U+00A0, so using them
         * here would discard `&nbsp;` at the edges of a text run and would
         * swallow a regular space that follows a `&nbsp;`.
         *
         * Every entity that is still open (present in `stacks`) is grown by
         * the length of the text that was actually appended.
         *
         * @param tagEnd  Whether the buffered text is directly followed by
         *     a closing tag (or by the end of input)
         */
        function processPendingText(tagEnd = false) {
            if (!pendingText.length) return

            if (!stacks.pre?.length) {
                // collapse whitespace runs, keeping non-breaking spaces intact
                pendingText = pendingText.replace(/[^\S\u00A0]+/g, ' ')

                if (tagEnd) {
                    // trim the end, but never strip a trailing &nbsp;
                    pendingText = pendingText.replace(/[^\S\u00A0]+$/, '')
                }

                if (!plainText.length || /[^\S\u00A0]$/.test(plainText)) {
                    // trim the start, but never strip a leading &nbsp;
                    pendingText = pendingText.replace(/^[^\S\u00A0]+/, '')
                }
            }

            // measured after normalization so entity lengths match the output
            for (const ents of Object.values(stacks)) {
                for (const ent of ents) {
                    ent.length += pendingText.length
                }
            }

            plainText += pendingText
            pendingText = ''
        }
        const parser = new Parser({
            onopentag(name, attribs) {
                name = name.toLowerCase()
                processPendingText()
                // ignore tags inside pre (except pre)
                if (name !== 'pre' && stacks.pre?.length) return
                let entity: tl.TypeMessageEntity
                switch (name) {
                    case 'br':
                        plainText += '\n'
                        return
                    case 'b':
                    case 'strong':
                        entity = {
@ -184,7 +195,11 @@ export class HtmlMessageEntityParser implements IMessageEntityParser {
                                    userId: {
                                        _: 'inputUser',
                                        userId: id,
-                                        accessHash: Long.fromString(accessHash, false, 16),
+                                        accessHash: Long.fromString(
                                            accessHash,
                                            false,
                                            16
                                        ),
                                    },
                                }
                            } else {
@ -216,25 +231,33 @@ export class HtmlMessageEntityParser implements IMessageEntityParser {
                }
                stacks[name].push(entity)
            },
            ontext(data) {
                for (const ents of Object.values(stacks)) {
                    for (const ent of ents) {
                        ent.length += data.length
                    }
                }
                plainText += data
            },
            onclosetag(name: string) {
                processPendingText(true)
                name = name.toLowerCase()
                // ignore tags inside pre (except pre)
                if (name !== 'pre' && stacks.pre?.length) return
                const entity = stacks[name]?.pop()
                if (!entity) return // unmatched close tag
                // ignore nested pre-s
                if (name !== 'pre' || !stacks.pre.length) {
                    entities.push(entity)
                }
            },
            ontext(data) {
                pendingText += data
            },
        })
        parser.write(text)
-        return [plainText, entities]
+        processPendingText(true)
        return [plainText.replace(/\u00A0/g, ' '), entities]
    }
    unparse(text: string, entities: ReadonlyArray<MessageEntity>): string {
--- a/packages/html-parser/tests/html-parser.spec.ts
+++ b/packages/html-parser/tests/html-parser.spec.ts
@ -3,7 +3,7 @@ import { expect } from 'chai'
 import { tl } from '@mtcute/tl'
 import { HtmlMessageEntityParser, html } from '../src'
 import { MessageEntity, FormattedString } from '@mtcute/client'
-import bigInt from 'big-integer'
+import Long from 'long'
 const createEntity = <T extends tl.TypeMessageEntity['_']>(
    type: T,
@ -257,6 +257,22 @@ describe('HtmlMessageEntityParser', () => {
                parser
            )
        })
        it('should replace newlines with <br>', () => {
            test(
                'plain\n\nplain',
                [],
                'plain<br><br>plain'
            )
        })
        it('should replace multiple spaces with &nbsp;', () => {
            test(
                'plain    plain',
                [],
                'plain&nbsp;&nbsp;&nbsp;&nbsp;plain'
            )
        })
    })
    describe('parse', () => {
@ -316,7 +332,7 @@ describe('HtmlMessageEntityParser', () => {
                        userId: {
                            _: 'inputUser',
                            userId: 1234567,
-                            accessHash: bigInt('aabbccddaabbccdd', 16),
+                            accessHash: Long.fromString('aabbccddaabbccdd', 16),
                        },
                    }),
                ],
@ -337,6 +353,111 @@ describe('HtmlMessageEntityParser', () => {
            )
        })
        it('should ignore other tags inside <pre>', () => {
            test(
                '<pre><b>bold</b> and not bold</pre>',
                [createEntity('messageEntityPre', 0, 17, { language: '' })],
                'bold and not bold'
            )
            test(
                '<pre><pre>pre inside pre</pre> so cool</pre>',
                [createEntity('messageEntityPre', 0, 22, { language: '' })],
                'pre inside pre so cool'
            )
        })
        it('should ignore newlines and indentation', () => {
            test(
                'this is some text\n\nwith newlines',
                [],
                'this is some text with newlines'
            )
            test(
                '<b>this is some text\n\nwith</b> newlines',
                [createEntity('messageEntityBold', 0, 22)],
                'this is some text with newlines'
            )
            test(
                '<b>this is some text ending with\n\n</b> newlines',
                [createEntity('messageEntityBold', 0, 29)],
                'this is some text ending with newlines'
            )
            test(
                `
                this  is  some  indented  text
                with    newlines     and
                <b>
                    indented tags
                </b> yeah <i>so cool
                </i>
                `,
                [
                    createEntity('messageEntityBold', 45, 13),
                    createEntity('messageEntityItalic', 64, 7),
                ],
                'this is some indented text with newlines and indented tags yeah so cool'
            )
        })
        it('should not ignore newlines and indentation in pre', () => {
            test(
                '<pre>this is some text\n\nwith newlines</pre>',
                [createEntity('messageEntityPre', 0, 32, { language: '' })],
                'this is some text\n\nwith newlines'
            )
            // whitespace inside <pre> is kept verbatim, so the expected text has to spell out the indentation
            const indent = '                '
            test(
                `<pre>
                this  is  some  indented  text
                with    newlines     and
                <b>
                    indented tags
                </b> yeah <i>so cool
                </i>
                </pre>`,
                [createEntity('messageEntityPre', 0, 203, { language: '' })],
                '\n' +
                    indent +
                    'this  is  some  indented  text\n' +
                    indent +
                    'with    newlines     and\n' +
                    indent +
                    '\n' +
                    indent +
                    '    indented tags\n' +
                    indent +
                    ' yeah so cool\n' +
                    indent +
                    '\n' +
                    indent
            )
        })
        it('should handle <br>', () => {
            test(
                'this is some text<br><br>with actual newlines',
                [],
                'this is some text\n\nwith actual newlines'
            )
            test(
                '<b>this is some text<br><br></b>with actual newlines',
                // note that the <br> (i.e. \n) is not included in the entity
                // this is expected, and the result is the same
                [createEntity('messageEntityBold', 0, 17)],
                'this is some text\n\nwith actual newlines'
            )
        })
        it('should handle &nbsp;', () => {
            test(
                'one    space, many&nbsp;&nbsp;&nbsp;&nbsp;spaces, and<br>a newline',
                [],
                'one space, many    spaces, and\na newline'
            )
        })
        it('should support entities on the edges', () => {
            test(
                '<b>Hello</b>, <b>world</b>',
@ -456,9 +577,15 @@ describe('HtmlMessageEntityParser', () => {
            const unsafeString = '<&>'
            expect(html`${unsafeString}`.value).eq('&lt;&amp;&gt;')
-            expect(html`${unsafeString} <b>text</b>`.value).eq('&lt;&amp;&gt; <b>text</b>')
+            expect(html`${unsafeString} <b>text</b>`.value).eq(
-            expect(html`<b>text</b> ${unsafeString}`.value).eq('<b>text</b> &lt;&amp;&gt;')
+                '&lt;&amp;&gt; <b>text</b>'
-            expect(html`<b>${unsafeString}</b>`.value).eq('<b>&lt;&amp;&gt;</b>')
+            )
            expect(html`<b>text</b> ${unsafeString}`.value).eq(
                '<b>text</b> &lt;&amp;&gt;'
            )
            expect(html`<b>${unsafeString}</b>`.value).eq(
                '<b>&lt;&amp;&gt;</b>'
            )
        })
        it('should skip with FormattedString', () => {
@ -467,10 +594,16 @@ describe('HtmlMessageEntityParser', () => {
            expect(html`${unsafeString}`.value).eq('<&>')
            expect(html`${unsafeString} ${unsafeString2}`.value).eq('<&> &lt;&amp;&gt;')
-            expect(html`${unsafeString} <b>text</b>`.value).eq('<&> <b>text</b>')
+            expect(html`${unsafeString} <b>text</b>`.value).eq(
-            expect(html`<b>text</b> ${unsafeString}`.value).eq('<b>text</b> <&>')
+                '<&> <b>text</b>'
            )
            expect(html`<b>text</b> ${unsafeString}`.value).eq(
                '<b>text</b> <&>'
            )
            expect(html`<b>${unsafeString}</b>`.value).eq('<b><&></b>')
-            expect(html`<b>${unsafeString} ${unsafeString2}</b>`.value).eq('<b><&> &lt;&amp;&gt;</b>')
+            expect(html`<b>${unsafeString} ${unsafeString2}</b>`.value).eq(
                '<b><&> &lt;&amp;&gt;</b>'
            )
        })
        it('should error with incompatible FormattedString', () => {