From 28baf50958255d69fb0106d2b9ed8d25626c40b0 Mon Sep 17 00:00:00 2001 From: teidesu <86301490+teidesu@users.noreply.github.com> Date: Fri, 6 May 2022 00:05:21 +0300 Subject: [PATCH] feat(html): big rework, process html similar to browsers --- packages/html-parser/README.md | 57 +++---- packages/html-parser/src/index.ts | 93 +++++++---- .../html-parser/tests/html-parser.spec.ts | 149 +++++++++++++++++- 3 files changed, 223 insertions(+), 76 deletions(-) diff --git a/packages/html-parser/README.md b/packages/html-parser/README.md index 85571048..41b904df 100644 --- a/packages/html-parser/README.md +++ b/packages/html-parser/README.md @@ -5,7 +5,7 @@ This package implements formatting syntax based on HTML, similar to the one available in the Bot API ([documented here](https://core.telegram.org/bots/api#html-style)) -> **NOTE**: The syntax implemented here is not entirely compatible with Bot API _HTML_. +> **NOTE**: The syntax implemented here is **incompatible** with Bot API _HTML_. > > Please read [Syntax](#syntax) below for a detailed explanation @@ -20,7 +20,7 @@ tg.registerParseMode(new HtmlMessageEntityParser()) tg.sendText( 'me', - html`Hello, me! Updates from the feed:\n${await getUpdatesFromFeed()}` + html`Hello, me! Updates from the feed:<br>${await getUpdatesFromFeed()}` ) ``` @@ -30,34 +30,26 @@ tg.sendText( supports nearly any HTML. However, since the text is still processed in a custom way for Telegram, the supported subset of features is documented below: -## Line breaks +## Line breaks and spaces -Line breaks are preserved, `<br>` are ignored. +Line breaks are **not** preserved, `<br>` is used instead, +making the syntax very close to the one used when building web pages. -> ⚠️ Warning for **Prettier** users: be aware that Prettier -> formats tagged template literals with `html` as normal HTML and may add -> unwanted line breaks. -> -> Use `htm` instead (which is just an alias): -> ```typescript -> import { htm } from '@mtcute/html-parser' -> -> await msg.answerText(htm`Hello, ${msg.sender.username}`) -> ``` +Multiple spaces and indents are collapsed, when you do need multiple spaces use `&nbsp;` instead. ## Inline entities Inline entities are entities that are in-line with other text. We support these entities: -| Name | Code | Result (visual) -|---|---|---| -| Bold | `<b>text</b>` | **text** -| Italic | `<i>text</i>` | _text_ -| Underline | `<u>text</u>` | <u>text</u> -| Strikethrough | `<s>text</s>` | ~~text~~ -| Monospace (code) | `<code>text</code>` | `text` -| Text link | `<a href="https://google.com">Google</a>` | [Google](https://google.com) -| Text mention | `<a href="tg://user?id=1234567">Name</a>` | N/A +| Name | Code | Result (visual) | +|------------------|-------------------------------------------|------------------------------| +| Bold | `<b>text</b>` | **text** | +| Italic | `<i>text</i>` | _text_ | +| Underline | `<u>text</u>` | <u>text</u> | +| Strikethrough | `<s>text</s>` | ~~text~~ | +| Monospace (code) | `<code>text</code>` | `text` | +| Text link | `<a href="https://google.com">Google</a>` | [Google](https://google.com) | +| Text mention | `<a href="tg://user?id=1234567">Name</a>` | N/A | > **Note**: `<strong>`, `<em>`, `<ins>`, `<strike>`, `<del>` are not supported because they are redundant @@ -82,10 +74,10 @@ Optionally, language for `<pre>` block can be specified like this:
 > However, since syntax highlighting hasn't been implemented in
 > official Telegram clients, this doesn't really matter 🤷‍♀️
 
-| Code | Result (visual)
-|---|---|
-| <pre>&lt;pre&gt;multiline\ntext&lt;/pre&gt;</pre> | <pre>multiline<br>text</pre>
-| <pre>&lt;pre language="javascript"&gt;<br>export default 42<br>&lt;/pre&gt;</pre> | <pre>export default 42</pre>
+| Code | Result (visual) | +|-------------------------------------------------------------------------------------|------------------------------| +| <pre>&lt;pre&gt;multiline\ntext&lt;/pre&gt;</pre> | <pre>multiline<br>text</pre> | +| <pre>&lt;pre language="javascript"&gt;<br>export default 42<br>&lt;/pre&gt;</pre> | <pre>export default 42</pre> | ## Nested and overlapped entities @@ -94,12 +86,11 @@ as expected! Overlapping entities are supported in `unparse()`, though. -| Code | Result (visual) -|---|---| -| `Welcome back, User!` | **Welcome back, _User_!** -| `bold and italic` | **bold _and_** italic
⚠️ word "italic" is not actually italic! -| `bold and italic`
⚠️ this is how unparse() handles overlapping entities | ** -bold _and_** _italic_ +| Code | Result (visual) | +|---------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------| +| `Welcome back, User!` | **Welcome back, _User_!** | +| `bold and italic` | **bold _and_** italic
⚠️ word "italic" is not actually italic! | +| `bold and italic`
⚠️ this is how unparse() handles overlapping entities | **bold _and_** _italic_ | ## Escaping diff --git a/packages/html-parser/src/index.ts b/packages/html-parser/src/index.ts index 5e7e2266..4111ebc2 100644 --- a/packages/html-parser/src/index.ts +++ b/packages/html-parser/src/index.ts @@ -7,7 +7,8 @@ import { tl } from '@mtcute/tl' import { Parser } from 'htmlparser2' import Long from 'long' -const MENTION_REGEX = /^tg:\/\/user\?id=(\d+)(?:&hash=(-?[0-9a-fA-F]+)(?:&|$)|&|$)/ +const MENTION_REGEX = + /^tg:\/\/user\?id=(\d+)(?:&hash=(-?[0-9a-fA-F]+)(?:&|$)|&|$)/ /** * Tagged template based helper for escaping entities in HTML @@ -35,28 +36,6 @@ export function html( return { value: str + strings[strings.length - 1], mode: 'html' } } -/** - * Alias for {@link html} for Prettier users. - * - * Prettier formats html`...` as normal HTML, - * thus may add unwanted line breaks. - */ -// eslint-disable-next-line @typescript-eslint/ban-ts-comment -// @ts-ignore -export declare function htm( - strings: TemplateStringsArray, - ...sub: (string | FormattedString)[] -): FormattedString - -/** @internal */ -// eslint-disable-next-line @typescript-eslint/ban-ts-comment -// @ts-ignore -export const htm = html - -// ts ignores above are a hack so the resulting d.ts contains `htm` -// as a function and not a variable, thus the ide would highlight -// it as such (the same way as `html`) - export namespace HtmlMessageEntityParser { /** * Syntax highlighter function used in {@link HtmlMessageEntityParser.unparse} @@ -106,13 +85,45 @@ export class HtmlMessageEntityParser implements IMessageEntityParser { const stacks: Record[]> = {} const entities: tl.TypeMessageEntity[] = [] let plainText = '' + let pendingText = '' + + function processPendingText(tagEnd = false) { + if (!pendingText.length) return + + if (!stacks.pre?.length) { + pendingText = pendingText.replace(/[^\S\u00A0]+/gs, ' ') + + if (tagEnd) pendingText = pendingText.trimEnd() + + if (!plainText.length || 
plainText.match(/\s$/)) { + pendingText = pendingText.trimStart() + } + } + + for (const ents of Object.values(stacks)) { + for (const ent of ents) { + ent.length += pendingText.length + } + } + + plainText += pendingText + pendingText = '' + } const parser = new Parser({ onopentag(name, attribs) { name = name.toLowerCase() + processPendingText() + + // ignore tags inside pre (except pre) + if (name !== 'pre' && stacks.pre?.length) return + let entity: tl.TypeMessageEntity switch (name) { + case 'br': + plainText += '\n' + return case 'b': case 'strong': entity = { @@ -184,7 +195,11 @@ export class HtmlMessageEntityParser implements IMessageEntityParser { userId: { _: 'inputUser', userId: id, - accessHash: Long.fromString(accessHash, false, 16), + accessHash: Long.fromString( + accessHash, + false, + 16 + ), }, } } else { @@ -216,25 +231,33 @@ export class HtmlMessageEntityParser implements IMessageEntityParser { } stacks[name].push(entity) }, - ontext(data) { - for (const ents of Object.values(stacks)) { - for (const ent of ents) { - ent.length += data.length - } - } - - plainText += data - }, onclosetag(name: string) { + processPendingText(true) + + name = name.toLowerCase() + + // ignore tags inside pre (except pre) + if (name !== 'pre' && stacks.pre?.length) return + const entity = stacks[name]?.pop() + if (!entity) return // unmatched close tag - entities.push(entity) + + // ignore nested pre-s + if (name !== 'pre' || !stacks.pre.length) { + entities.push(entity) + } + }, + ontext(data) { + pendingText += data }, }) parser.write(text) - return [plainText, entities] + processPendingText(true) + + return [plainText.replace(/\u00A0/g, ' '), entities] } unparse(text: string, entities: ReadonlyArray): string { diff --git a/packages/html-parser/tests/html-parser.spec.ts b/packages/html-parser/tests/html-parser.spec.ts index ebde052c..89cd5fff 100644 --- a/packages/html-parser/tests/html-parser.spec.ts +++ b/packages/html-parser/tests/html-parser.spec.ts @@ -3,7 +3,7 
@@ import { expect } from 'chai' import { tl } from '@mtcute/tl' import { HtmlMessageEntityParser, html } from '../src' import { MessageEntity, FormattedString } from '@mtcute/client' -import bigInt from 'big-integer' +import Long from 'long' const createEntity = ( type: T, @@ -257,6 +257,22 @@ describe('HtmlMessageEntityParser', () => { parser ) }) + + it('should replace newlines with
', () => { + test( + 'plain\n\nplain', + [], + 'plain

plain' + ) + }) + + it('should replace multiple spaces with  ', () => { + test( + 'plain plain', + [], + 'plain    plain' + ) + }) }) describe('parse', () => { @@ -316,7 +332,7 @@ describe('HtmlMessageEntityParser', () => { userId: { _: 'inputUser', userId: 1234567, - accessHash: bigInt('aabbccddaabbccdd', 16), + accessHash: Long.fromString('aabbccddaabbccdd', 16), }, }), ], @@ -337,6 +353,111 @@ describe('HtmlMessageEntityParser', () => { ) }) + it('should ignore other tags inside
', () => {
+            test(
+                '
bold and not bold
', + [createEntity('messageEntityPre', 0, 17, { language: '' })], + 'bold and not bold' + ) + test( + '
pre inside pre
so cool
', + [createEntity('messageEntityPre', 0, 22, { language: '' })], + 'pre inside pre so cool' + ) + }) + + it('should ignore newlines and indentation', () => { + test( + 'this is some text\n\nwith newlines', + [], + 'this is some text with newlines' + ) + test( + 'this is some text\n\nwith newlines', + [createEntity('messageEntityBold', 0, 22)], + 'this is some text with newlines' + ) + test( + 'this is some text ending with\n\n newlines', + [createEntity('messageEntityBold', 0, 29)], + 'this is some text ending with newlines' + ) + test( + ` + this is some indented text + with newlines and + + indented tags + yeah so cool + + `, + [ + createEntity('messageEntityBold', 45, 13), + createEntity('messageEntityItalic', 64, 7), + ], + 'this is some indented text with newlines and indented tags yeah so cool' + ) + }) + + it('should not ignore newlines and indentation in pre', () => { + test( + '
this is some text\n\nwith newlines
', + [createEntity('messageEntityPre', 0, 32, { language: '' })], + 'this is some text\n\nwith newlines' + ) + + // fuck my life + const indent = ' ' + test( + `
+                this  is  some  indented  text
+                with    newlines     and
+                
+                    indented tags
+                 yeah so cool
+                
+                
`, + [createEntity('messageEntityPre', 0, 203, { language: '' })], + '\n' + + indent + + 'this is some indented text\n' + + indent + + 'with newlines and\n' + + indent + + '\n' + + indent + + ' indented tags\n' + + indent + + ' yeah so cool\n' + + indent + + '\n' + + indent + ) + }) + + it('should handle
', () => { + test( + 'this is some text

with actual newlines', + [], + 'this is some text\n\nwith actual newlines' + ) + test( + 'this is some text

with actual newlines', + // note that the
(i.e. \n) is not included in the entity + // this is expected, and the result is the same + [createEntity('messageEntityBold', 0, 17)], + 'this is some text\n\nwith actual newlines' + ) + }) + + it('should handle  ', () => { + test( + 'one space, many    spaces, and
a newline', + [], + 'one space, many spaces, and\na newline' + ) + }) + it('should support entities on the edges', () => { test( 'Hello, world', @@ -456,9 +577,15 @@ describe('HtmlMessageEntityParser', () => { const unsafeString = '<&>' expect(html`${unsafeString}`.value).eq('<&>') - expect(html`${unsafeString} text`.value).eq('<&> text') - expect(html`text ${unsafeString}`.value).eq('text <&>') - expect(html`${unsafeString}`.value).eq('<&>') + expect(html`${unsafeString} text`.value).eq( + '<&> text' + ) + expect(html`text ${unsafeString}`.value).eq( + 'text <&>' + ) + expect(html`${unsafeString}`.value).eq( + '<&>' + ) }) it('should skip with FormattedString', () => { @@ -467,10 +594,16 @@ describe('HtmlMessageEntityParser', () => { expect(html`${unsafeString}`.value).eq('<&>') expect(html`${unsafeString} ${unsafeString2}`.value).eq('<&> <&>') - expect(html`${unsafeString} text`.value).eq('<&> text') - expect(html`text ${unsafeString}`.value).eq('text <&>') + expect(html`${unsafeString} text`.value).eq( + '<&> text' + ) + expect(html`text ${unsafeString}`.value).eq( + 'text <&>' + ) expect(html`${unsafeString}`.value).eq('<&>') - expect(html`${unsafeString} ${unsafeString2}`.value).eq('<&> <&>') + expect(html`${unsafeString} ${unsafeString2}`.value).eq( + '<&> <&>' + ) }) it('should error with incompatible FormattedString', () => {