feat(html): big rework, process html similar to browsers

This commit is contained in:
teidesu 2022-05-06 00:05:21 +03:00
parent d031388ea2
commit 28baf50958
3 changed files with 223 additions and 76 deletions

View file

@ -5,7 +5,7 @@
This package implements formatting syntax based on HTML, similar to the one available in the Bot
API ([documented here](https://core.telegram.org/bots/api#html-style))
> **NOTE**: The syntax implemented here is not entirely compatible with Bot API _HTML_.
> **NOTE**: The syntax implemented here is **incompatible** with Bot API _HTML_.
>
> Please read [Syntax](#syntax) below for a detailed explanation
@ -20,7 +20,7 @@ tg.registerParseMode(new HtmlMessageEntityParser())
tg.sendText(
'me',
html`Hello, <b>me</b>! Updates from the feed:\n${await getUpdatesFromFeed()}`
html`Hello, <b>me</b>! Updates from the feed:<br>${await getUpdatesFromFeed()}`
)
```
@ -30,34 +30,26 @@ tg.sendText(
supports nearly any HTML. However, since the text is still processed in a custom way for Telegram, the supported subset
of features is documented below:
## Line breaks
## Line breaks and spaces
Line breaks are preserved, `<br>` are ignored.
Line breaks are **not** preserved; use `<br>` to insert one.
This makes the syntax very close to the one used when building web pages.
> ⚠️ Warning for **Prettier** users: be aware that Prettier
> formats tagged template literals with `html` as normal HTML and may add
> unwanted line breaks.
>
> Use `htm` instead (which is just an alias):
> ```typescript
> import { htm } from '@mtcute/html-parser'
>
> await msg.answerText(htm`Hello, <b>${msg.sender.username}</b>`)
> ```
Multiple spaces and indents are collapsed into a single space. When you do need multiple spaces, use `&nbsp;` instead.
## Inline entities
Inline entities are entities that are in-line with other text. We support these entities:
| Name | Code | Result (visual)
|---|---|---|
| Bold | `<b>text</b>` | **text**
| Italic | `<i>text</i>` | _text_
| Underline | `<u>text</u>` | <u>text</u>
| Strikethrough | `<s>text</s>` | ~~text~~
| Monospace (code) | `<code>text</code>` | `text`
| Text link | `<a href="https://google.com">Google</a>` | [Google](https://google.com)
| Text mention | `<a href="tg://user?id=1234567">Name</a>` | N/A
| Name | Code | Result (visual) |
|------------------|-------------------------------------------|------------------------------|
| Bold | `<b>text</b>` | **text** |
| Italic | `<i>text</i>` | _text_ |
| Underline | `<u>text</u>` | <u>text</u> |
| Strikethrough | `<s>text</s>` | ~~text~~ |
| Monospace (code) | `<code>text</code>` | `text` |
| Text link | `<a href="https://google.com">Google</a>` | [Google](https://google.com) |
| Text mention | `<a href="tg://user?id=1234567">Name</a>` | N/A |
> **Note**: `<strong>`, `<em>`, `<ins>`, `<strike>`, `<del>` are not supported because they are redundant
@ -82,10 +74,10 @@ Optionally, language for `<pre>` block can be specified like this:
> However, since syntax highlighting hasn't been implemented in
> official Telegram clients, this doesn't really matter 🤷‍♀️
| Code | Result (visual)
|---|---|
| <pre>&lt;pre&gt;multiline\ntext&lt;/pre&gt;</pre> | <pre>multiline<br>text</pre>
| <pre>&lt;pre language="javascript"&gt;<br> export default 42<br>&lt;/pre&gt;</pre> | <pre>export default 42</pre>
| Code | Result (visual) |
|-------------------------------------------------------------------------------------|------------------------------|
| <pre>&lt;pre&gt;multiline\ntext&lt;/pre&gt;</pre> | <pre>multiline<br>text</pre> |
| <pre>&lt;pre language="javascript"&gt;<br> export default 42<br>&lt;/pre&gt;</pre> | <pre>export default 42</pre> |
## Nested and overlapped entities
@ -94,12 +86,11 @@ as expected!
Overlapping entities are supported in `unparse()`, though.
| Code | Result (visual)
|---|---|
| `<b>Welcome back, <i>User</i>!</b>` | **Welcome back, _User_!**
| `<b>bold <i>and</b> italic</i>` | **bold _and_** italic<br>⚠️ <i>word "italic" is not actually italic!</i>
| `<b>bold <i>and</i></b><i> italic</i>`<br>⚠️ <i>this is how <code>unparse()</code> handles overlapping entities</i> | **
bold _and_** _italic_
| Code | Result (visual) |
|---------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------|
| `<b>Welcome back, <i>User</i>!</b>` | **Welcome back, _User_!** |
| `<b>bold <i>and</b> italic</i>` | **bold _and_** italic<br>⚠️ <i>word "italic" is not actually italic!</i> |
| `<b>bold <i>and</i></b><i> italic</i>`<br>⚠️ <i>this is how <code>unparse()</code> handles overlapping entities</i> | **bold _and_** _italic_ |
## Escaping

View file

@ -7,7 +7,8 @@ import { tl } from '@mtcute/tl'
import { Parser } from 'htmlparser2'
import Long from 'long'
const MENTION_REGEX = /^tg:\/\/user\?id=(\d+)(?:&hash=(-?[0-9a-fA-F]+)(?:&|$)|&|$)/
const MENTION_REGEX =
/^tg:\/\/user\?id=(\d+)(?:&hash=(-?[0-9a-fA-F]+)(?:&|$)|&|$)/
/**
* Tagged template based helper for escaping entities in HTML
@ -35,28 +36,6 @@ export function html(
return { value: str + strings[strings.length - 1], mode: 'html' }
}
/**
* Alias for {@link html} for Prettier users.
*
* Prettier formats <code>html`...`</code> as normal HTML,
* thus may add unwanted line breaks.
*/
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
export declare function htm(
strings: TemplateStringsArray,
...sub: (string | FormattedString)[]
): FormattedString
/** @internal */
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
export const htm = html
// ts ignores above are a hack so the resulting d.ts contains `htm`
// as a function and not a variable, thus the ide would highlight
// it as such (the same way as `html`)
export namespace HtmlMessageEntityParser {
/**
* Syntax highlighter function used in {@link HtmlMessageEntityParser.unparse}
@ -106,13 +85,45 @@ export class HtmlMessageEntityParser implements IMessageEntityParser {
const stacks: Record<string, tl.Mutable<tl.TypeMessageEntity>[]> = {}
const entities: tl.TypeMessageEntity[] = []
let plainText = ''
let pendingText = ''
/**
 * Flushes the text buffered in `pendingText` into `plainText`.
 *
 * Outside of `<pre>` (i.e. when `stacks.pre` is empty or missing) the
 * buffered text is normalized first:
 *  - every run of whitespace other than U+00A0 is collapsed into
 *    a single regular space;
 *  - when `tagEnd` is set, one trailing regular space is dropped;
 *  - when `plainText` is empty or already ends with whitespace other
 *    than U+00A0, one leading regular space is dropped.
 *
 * U+00A0 (presumably what `&nbsp;` is decoded to by the HTML parser) is
 * never collapsed or trimmed here, so it can be used to force spaces
 * at the edges of a text run.
 *
 * After normalization, the `length` of every entity currently held in
 * `stacks` is increased by the length of the flushed text, and the
 * buffer is cleared.
 *
 * @param tagEnd  Whether the text being flushed is immediately followed
 *   by a closing tag (or by the end of the input)
 */
function processPendingText(tagEnd = false) {
    if (!pendingText.length) return

    if (!stacks.pre?.length) {
        // `[^\S\u00A0]` is "any whitespace except a non-breaking space"
        pendingText = pendingText.replace(/[^\S\u00A0]+/g, ' ')

        // After collapsing, each edge holds at most one regular space.
        // Strip it explicitly: `trimStart()`/`trimEnd()` would also eat
        // U+00A0, silently dropping non-breaking spaces on the edges.
        if (tagEnd) pendingText = pendingText.replace(/ $/, '')

        if (!plainText.length || /[^\S\u00A0]$/.test(plainText)) {
            pendingText = pendingText.replace(/^ /, '')
        }
    }

    // every entity that is still open covers the text being appended
    for (const ents of Object.values(stacks)) {
        for (const ent of ents) {
            ent.length += pendingText.length
        }
    }

    plainText += pendingText
    pendingText = ''
}
const parser = new Parser({
onopentag(name, attribs) {
name = name.toLowerCase()
processPendingText()
// ignore tags inside pre (except pre)
if (name !== 'pre' && stacks.pre?.length) return
let entity: tl.TypeMessageEntity
switch (name) {
case 'br':
plainText += '\n'
return
case 'b':
case 'strong':
entity = {
@ -184,7 +195,11 @@ export class HtmlMessageEntityParser implements IMessageEntityParser {
userId: {
_: 'inputUser',
userId: id,
accessHash: Long.fromString(accessHash, false, 16),
accessHash: Long.fromString(
accessHash,
false,
16
),
},
}
} else {
@ -216,25 +231,33 @@ export class HtmlMessageEntityParser implements IMessageEntityParser {
}
stacks[name].push(entity)
},
ontext(data) {
for (const ents of Object.values(stacks)) {
for (const ent of ents) {
ent.length += data.length
}
}
plainText += data
},
onclosetag(name: string) {
processPendingText(true)
name = name.toLowerCase()
// ignore tags inside pre (except pre)
if (name !== 'pre' && stacks.pre?.length) return
const entity = stacks[name]?.pop()
if (!entity) return // unmatched close tag
entities.push(entity)
// ignore nested pre-s
if (name !== 'pre' || !stacks.pre.length) {
entities.push(entity)
}
},
ontext(data) {
pendingText += data
},
})
parser.write(text)
return [plainText, entities]
processPendingText(true)
return [plainText.replace(/\u00A0/g, ' '), entities]
}
unparse(text: string, entities: ReadonlyArray<MessageEntity>): string {

View file

@ -3,7 +3,7 @@ import { expect } from 'chai'
import { tl } from '@mtcute/tl'
import { HtmlMessageEntityParser, html } from '../src'
import { MessageEntity, FormattedString } from '@mtcute/client'
import bigInt from 'big-integer'
import Long from 'long'
const createEntity = <T extends tl.TypeMessageEntity['_']>(
type: T,
@ -257,6 +257,22 @@ describe('HtmlMessageEntityParser', () => {
parser
)
})
it('should replace newlines with <br>', () => {
test(
'plain\n\nplain',
[],
'plain<br><br>plain'
)
})
it('should replace multiple spaces with &nbsp;', () => {
test(
'plain plain',
[],
'plain&nbsp;&nbsp;&nbsp;&nbsp;plain'
)
})
})
describe('parse', () => {
@ -316,7 +332,7 @@ describe('HtmlMessageEntityParser', () => {
userId: {
_: 'inputUser',
userId: 1234567,
accessHash: bigInt('aabbccddaabbccdd', 16),
accessHash: Long.fromString('aabbccddaabbccdd', 16),
},
}),
],
@ -337,6 +353,111 @@ describe('HtmlMessageEntityParser', () => {
)
})
it('should ignore other tags inside <pre>', () => {
test(
'<pre><b>bold</b> and not bold</pre>',
[createEntity('messageEntityPre', 0, 17, { language: '' })],
'bold and not bold'
)
test(
'<pre><pre>pre inside pre</pre> so cool</pre>',
[createEntity('messageEntityPre', 0, 22, { language: '' })],
'pre inside pre so cool'
)
})
it('should ignore newlines and indentation', () => {
test(
'this is some text\n\nwith newlines',
[],
'this is some text with newlines'
)
test(
'<b>this is some text\n\nwith</b> newlines',
[createEntity('messageEntityBold', 0, 22)],
'this is some text with newlines'
)
test(
'<b>this is some text ending with\n\n</b> newlines',
[createEntity('messageEntityBold', 0, 29)],
'this is some text ending with newlines'
)
test(
`
this is some indented text
with newlines and
<b>
indented tags
</b> yeah <i>so cool
</i>
`,
[
createEntity('messageEntityBold', 45, 13),
createEntity('messageEntityItalic', 64, 7),
],
'this is some indented text with newlines and indented tags yeah so cool'
)
})
it('should not ignore newlines and indentation in pre', () => {
test(
'<pre>this is some text\n\nwith newlines</pre>',
[createEntity('messageEntityPre', 0, 32, { language: '' })],
'this is some text\n\nwith newlines'
)
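// whitespace inside <pre> is kept verbatim, so the expected text is
// assembled line by line with the template literal's indentation spelled out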
const indent = ' '
test(
`<pre>
this is some indented text
with newlines and
<b>
indented tags
</b> yeah <i>so cool
</i>
</pre>`,
[createEntity('messageEntityPre', 0, 203, { language: '' })],
'\n' +
indent +
'this is some indented text\n' +
indent +
'with newlines and\n' +
indent +
'\n' +
indent +
' indented tags\n' +
indent +
' yeah so cool\n' +
indent +
'\n' +
indent
)
})
it('should handle <br>', () => {
test(
'this is some text<br><br>with actual newlines',
[],
'this is some text\n\nwith actual newlines'
)
test(
'<b>this is some text<br><br></b>with actual newlines',
// note that the <br> (i.e. \n) is not included in the entity
// this is expected, and the result is the same
[createEntity('messageEntityBold', 0, 17)],
'this is some text\n\nwith actual newlines'
)
})
it('should handle &nbsp;', () => {
test(
'one space, many&nbsp;&nbsp;&nbsp;&nbsp;spaces, and<br>a newline',
[],
'one space, many spaces, and\na newline'
)
})
it('should support entities on the edges', () => {
test(
'<b>Hello</b>, <b>world</b>',
@ -456,9 +577,15 @@ describe('HtmlMessageEntityParser', () => {
const unsafeString = '<&>'
expect(html`${unsafeString}`.value).eq('&lt;&amp;&gt;')
expect(html`${unsafeString} <b>text</b>`.value).eq('&lt;&amp;&gt; <b>text</b>')
expect(html`<b>text</b> ${unsafeString}`.value).eq('<b>text</b> &lt;&amp;&gt;')
expect(html`<b>${unsafeString}</b>`.value).eq('<b>&lt;&amp;&gt;</b>')
expect(html`${unsafeString} <b>text</b>`.value).eq(
'&lt;&amp;&gt; <b>text</b>'
)
expect(html`<b>text</b> ${unsafeString}`.value).eq(
'<b>text</b> &lt;&amp;&gt;'
)
expect(html`<b>${unsafeString}</b>`.value).eq(
'<b>&lt;&amp;&gt;</b>'
)
})
it('should skip with FormattedString', () => {
@ -467,10 +594,16 @@ describe('HtmlMessageEntityParser', () => {
expect(html`${unsafeString}`.value).eq('<&>')
expect(html`${unsafeString} ${unsafeString2}`.value).eq('<&> &lt;&amp;&gt;')
expect(html`${unsafeString} <b>text</b>`.value).eq('<&> <b>text</b>')
expect(html`<b>text</b> ${unsafeString}`.value).eq('<b>text</b> <&>')
expect(html`${unsafeString} <b>text</b>`.value).eq(
'<&> <b>text</b>'
)
expect(html`<b>text</b> ${unsafeString}`.value).eq(
'<b>text</b> <&>'
)
expect(html`<b>${unsafeString}</b>`.value).eq('<b><&></b>')
expect(html`<b>${unsafeString} ${unsafeString2}</b>`.value).eq('<b><&> &lt;&amp;&gt;</b>')
expect(html`<b>${unsafeString} ${unsafeString2}</b>`.value).eq(
'<b><&> &lt;&amp;&gt;</b>'
)
})
it('should error with incompatible FormattedString', () => {