From eec142f0e50b7c74bc9105237c87e3786ef54e10 Mon Sep 17 00:00:00 2001 From: Alina Sireneva Date: Sat, 4 Nov 2023 06:44:18 +0300 Subject: [PATCH] =?UTF-8?q?feat:=20wasm!=20=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/core/package.json | 2 +- packages/core/src/base-client.ts | 1 + packages/core/src/network/auth-key.ts | 13 +- packages/core/src/network/authorization.ts | 10 +- .../core/src/network/session-connection.ts | 78 +- .../core/src/network/transports/obfuscated.ts | 28 +- packages/core/src/utils/crypto/abstract.ts | 31 +- packages/core/src/utils/crypto/common.ts | 67 - packages/core/src/utils/crypto/node-crypto.ts | 64 - packages/core/src/utils/crypto/node.ts | 93 + .../src/utils/crypto/{subtle.ts => web.ts} | 71 +- packages/core/src/utils/platform/crypto.ts | 2 +- .../core/src/utils/platform/crypto.web.ts | 4 +- packages/core/tests/auth-key.spec.ts | 2 +- packages/core/tests/crypto-providers.spec.ts | 86 +- packages/core/tests/keys.spec.ts | 2 +- packages/core/tests/mtproto-crypto.spec.ts | 2 +- packages/crypto-node/.gitignore | 2 +- packages/crypto-node/src/index.ts | 4 +- packages/crypto/README.md | 5 - packages/crypto/cbc256.c | 36 - packages/crypto/cbc256.h | 17 - packages/crypto/ctr256.c | 32 - packages/crypto/ctr256.h | 8 - packages/tl-runtime/package.json | 10 +- packages/tl-runtime/src/encodings/gzip.ts | 39 - packages/tl-runtime/src/encodings/gzip.web.ts | 40 - packages/tl-runtime/src/encodings/index.ts | 1 - packages/tl-runtime/src/index.ts | 1 - packages/tl-runtime/src/reader.ts | 10 +- packages/tl/scripts/gen-rsa-keys.ts | 2 +- packages/wasm/.gitignore | 1 + packages/wasm/README.md | 19 + packages/wasm/build.config.cjs | 20 + packages/wasm/lib/Dockerfile | 14 + packages/wasm/lib/Makefile | 75 + packages/wasm/lib/common_defs.h | 686 +++ packages/wasm/lib/crypto/COPYING.lesser | 165 + packages/{ => wasm/lib}/crypto/aes256.c | 0 packages/{ => wasm/lib}/crypto/aes256.h | 5 +- packages/wasm/lib/crypto/ctr256.c | 55 + packages/wasm/lib/crypto/ctr256.h | 6 + packages/{ => wasm/lib}/crypto/ige256.c | 4 +- packages/{ => wasm/lib}/crypto/ige256.h | 0 packages/wasm/lib/lib_common.h | 62 + packages/wasm/lib/libdeflate.h | 245 + packages/wasm/lib/libdeflate/COPYING | 21 + packages/wasm/lib/libdeflate/adler32.c | 123 + packages/wasm/lib/libdeflate/adler32.h | 8 + packages/wasm/lib/libdeflate/bt_matchfinder.h | 342 ++ .../wasm/lib/libdeflate/decompress_template.h | 777 ++++ .../wasm/lib/libdeflate/deflate_compress.c | 4119 +++++++++++++++++ .../wasm/lib/libdeflate/deflate_compress.h | 20 + .../wasm/lib/libdeflate/deflate_constants.h | 56 + .../wasm/lib/libdeflate/deflate_decompress.c | 1200 +++++ .../wasm/lib/libdeflate/deflate_decompress.h | 14 + packages/wasm/lib/libdeflate/gzip_constants.h | 45 + .../wasm/lib/libdeflate/gzip_decompress.c | 160 + packages/wasm/lib/libdeflate/hc_matchfinder.h | 401 ++ packages/wasm/lib/libdeflate/ht_matchfinder.h | 234 + .../wasm/lib/libdeflate/matchfinder_common.h | 194 + packages/wasm/lib/libdeflate/zlib_compress.c | 83 + packages/wasm/lib/libdeflate/zlib_constants.h | 21 + packages/wasm/lib/mtcute.wasm | Bin 0 -> 45120 bytes packages/wasm/lib/utils.c | 137 + packages/wasm/package.json | 29 + packages/wasm/src/index.ts | 213 + packages/wasm/src/init.ts | 24 + packages/wasm/src/init.web.ts | 42 + packages/wasm/src/types.ts | 24 + packages/wasm/tests/allocator.spec.ts | 21 + packages/wasm/tests/ctr.spec.ts | 149 + packages/wasm/tests/gunzip.spec.ts | 46 + 
packages/wasm/tests/ige.spec.ts | 40 + packages/wasm/tests/tsconfig.json | 9 + packages/wasm/tests/zlib.spec.ts | 49 + packages/wasm/tsconfig.json | 10 + packages/wasm/typedoc.cjs | 4 + pnpm-lock.yaml | 27 +- scripts/build-package.js | 4 - 80 files changed, 10231 insertions(+), 535 deletions(-) delete mode 100644 packages/core/src/utils/crypto/common.ts delete mode 100644 packages/core/src/utils/crypto/node-crypto.ts create mode 100644 packages/core/src/utils/crypto/node.ts rename packages/core/src/utils/crypto/{subtle.ts => web.ts} (54%) delete mode 100644 packages/crypto/README.md delete mode 100644 packages/crypto/cbc256.c delete mode 100644 packages/crypto/cbc256.h delete mode 100644 packages/crypto/ctr256.c delete mode 100644 packages/crypto/ctr256.h delete mode 100644 packages/tl-runtime/src/encodings/gzip.ts delete mode 100644 packages/tl-runtime/src/encodings/gzip.web.ts create mode 100644 packages/wasm/.gitignore create mode 100644 packages/wasm/README.md create mode 100644 packages/wasm/build.config.cjs create mode 100644 packages/wasm/lib/Dockerfile create mode 100644 packages/wasm/lib/Makefile create mode 100644 packages/wasm/lib/common_defs.h create mode 100644 packages/wasm/lib/crypto/COPYING.lesser rename packages/{ => wasm/lib}/crypto/aes256.c (100%) rename packages/{ => wasm/lib}/crypto/aes256.h (84%) create mode 100644 packages/wasm/lib/crypto/ctr256.c create mode 100644 packages/wasm/lib/crypto/ctr256.h rename packages/{ => wasm/lib}/crypto/ige256.c (84%) rename packages/{ => wasm/lib}/crypto/ige256.h (100%) create mode 100644 packages/wasm/lib/lib_common.h create mode 100644 packages/wasm/lib/libdeflate.h create mode 100644 packages/wasm/lib/libdeflate/COPYING create mode 100644 packages/wasm/lib/libdeflate/adler32.c create mode 100644 packages/wasm/lib/libdeflate/adler32.h create mode 100644 packages/wasm/lib/libdeflate/bt_matchfinder.h create mode 100644 packages/wasm/lib/libdeflate/decompress_template.h create mode 100644 packages/wasm/lib/libdeflate/deflate_compress.c create mode 100644 packages/wasm/lib/libdeflate/deflate_compress.h create mode 100644 packages/wasm/lib/libdeflate/deflate_constants.h create mode 100644 packages/wasm/lib/libdeflate/deflate_decompress.c create mode 100644 packages/wasm/lib/libdeflate/deflate_decompress.h create mode 100644 packages/wasm/lib/libdeflate/gzip_constants.h create mode 100644 packages/wasm/lib/libdeflate/gzip_decompress.c create mode 100644 packages/wasm/lib/libdeflate/hc_matchfinder.h create mode 100644 packages/wasm/lib/libdeflate/ht_matchfinder.h create mode 100644 packages/wasm/lib/libdeflate/matchfinder_common.h create mode 100644 packages/wasm/lib/libdeflate/zlib_compress.c create mode 100644 packages/wasm/lib/libdeflate/zlib_constants.h create mode 100755 packages/wasm/lib/mtcute.wasm create mode 100644 packages/wasm/lib/utils.c create mode 100644 packages/wasm/package.json create mode 100644 packages/wasm/src/index.ts create mode 100644 packages/wasm/src/init.ts create mode 100644 packages/wasm/src/init.web.ts create mode 100644 packages/wasm/src/types.ts create mode 100644 packages/wasm/tests/allocator.spec.ts create mode 100644 packages/wasm/tests/ctr.spec.ts create mode 100644 packages/wasm/tests/gunzip.spec.ts create mode 100644 packages/wasm/tests/ige.spec.ts create mode 100644 packages/wasm/tests/tsconfig.json create mode 100644 packages/wasm/tests/zlib.spec.ts create mode 100644 packages/wasm/tsconfig.json create mode 100644 packages/wasm/typedoc.cjs diff --git a/packages/core/package.json 
b/packages/core/package.json index e11ecdf9..078cde3c 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -46,13 +46,13 @@ "dependencies": { "@mtcute/tl": "workspace:^", "@mtcute/tl-runtime": "workspace:^", + "@mtcute/wasm": "workspace:^", "@types/events": "3.0.0", "big-integer": "1.6.51", "events": "3.2.0", "long": "5.2.3" }, "devDependencies": { - "@cryptography/aes": "^0.1.1", "@types/ws": "8.5.4", "node-forge": "1.3.1", "ws": "8.13.0" diff --git a/packages/core/src/base-client.ts b/packages/core/src/base-client.ts index 16da46d3..93b54246 100644 --- a/packages/core/src/base-client.ts +++ b/packages/core/src/base-client.ts @@ -361,6 +361,7 @@ export class BaseTelegramClient extends EventEmitter { const promise = (this._connected = createControllablePromise()) + await this.crypto.initialize?.() await this._loadStorage() const primaryDc = await this.storage.getDefaultDcs() if (primaryDc !== null) this._defaultDcs = primaryDc diff --git a/packages/core/src/network/auth-key.ts b/packages/core/src/network/auth-key.ts index deb12791..af6ec415 100644 --- a/packages/core/src/network/auth-key.ts +++ b/packages/core/src/network/auth-key.ts @@ -5,7 +5,14 @@ import { TlBinaryReader, TlReaderMap } from '@mtcute/tl-runtime' import { MtcuteError } from '../types/errors.js' import { createAesIgeForMessage } from '../utils/crypto/mtproto.js' -import { buffersEqual, concatBuffers, dataViewFromBuffer, ICryptoProvider, Logger, randomBytes } from '../utils/index.js' +import { + buffersEqual, + concatBuffers, + dataViewFromBuffer, + ICryptoProvider, + Logger, + randomBytes, +} from '../utils/index.js' export class AuthKey { ready = false @@ -55,7 +62,7 @@ export class AuthKey { const messageKey = (await this._crypto.sha256(concatBuffers([this.clientSalt, buf]))).subarray(8, 24) const ige = await createAesIgeForMessage(this._crypto, this.key, messageKey, true) - const encryptedData = await ige.encrypt(buf) + const encryptedData = ige.encrypt(buf) return concatBuffers([this.id, messageKey, encryptedData]) } @@ -78,7 +85,7 @@ export class AuthKey { } const ige = await createAesIgeForMessage(this._crypto, this.key, messageKey, false) - const innerData = await ige.decrypt(encryptedData) + const innerData = ige.decrypt(encryptedData) const msgKeySource = await this._crypto.sha256(concatBuffers([this.serverSalt, innerData])) const expectedMessageKey = msgKeySource.subarray(8, 24) diff --git a/packages/core/src/network/authorization.ts b/packages/core/src/network/authorization.ts index 7b184a5e..82f7a39b 100644 --- a/packages/core/src/network/authorization.ts +++ b/packages/core/src/network/authorization.ts @@ -141,8 +141,8 @@ async function rsaPad(data: Uint8Array, crypto: ICryptoProvider, key: TlPublicKe // we only need to reverse the data dataWithHash.subarray(0, 192).reverse() - const aes = await crypto.createAesIge(aesKey, aesIv) - const encrypted = await aes.encrypt(dataWithHash) + const aes = crypto.createAesIge(aesKey, aesIv) + const encrypted = aes.encrypt(dataWithHash) const encryptedHash = await crypto.sha256(encrypted) xorBufferInPlace(aesKey, encryptedHash) @@ -300,9 +300,9 @@ export async function doAuthorization( // Step 3: complete DH exchange const [key, iv] = await generateKeyAndIvFromNonce(crypto, resPq.serverNonce, newNonce) - const ige = await crypto.createAesIge(key, iv) + const ige = crypto.createAesIge(key, iv) - const plainTextAnswer = await ige.decrypt(serverDhParams.encryptedAnswer) + const plainTextAnswer = ige.decrypt(serverDhParams.encryptedAnswer) const 
innerDataHash = plainTextAnswer.subarray(0, 20) const serverDhInnerReader = new TlBinaryReader(readerMap, plainTextAnswer, 20) const serverDhInner = serverDhInnerReader.object() as mtp.TlObject @@ -379,7 +379,7 @@ export async function doAuthorization( log.debug('sending client DH (timeOffset = %d)', timeOffset) - const clientDhEncrypted = await ige.encrypt(clientDhInnerWriter.uint8View) + const clientDhEncrypted = ige.encrypt(clientDhInnerWriter.uint8View) await sendPlainMessage({ _: 'mt_set_client_DH_params', nonce, diff --git a/packages/core/src/network/session-connection.ts b/packages/core/src/network/session-connection.ts index e3e4afee..db8a4e36 100644 --- a/packages/core/src/network/session-connection.ts +++ b/packages/core/src/network/session-connection.ts @@ -3,15 +3,7 @@ import Long from 'long' import { mtp, tl } from '@mtcute/tl' -import { - gzipDeflate, - gzipInflate, - TlBinaryReader, - TlBinaryWriter, - TlReaderMap, - TlSerializationCounter, - TlWriterMap, -} from '@mtcute/tl-runtime' +import { TlBinaryReader, TlBinaryWriter, TlReaderMap, TlSerializationCounter, TlWriterMap } from '@mtcute/tl-runtime' import { MtArgumentError, MtcuteError, MtTimeoutError } from '../types/index.js' import { createAesIgeForMessageOld } from '../utils/crypto/mtproto.js' @@ -20,6 +12,7 @@ import { ControllablePromise, createControllablePromise, EarlyTimer, + ICryptoProvider, longFromBuffer, randomBytes, randomLong, @@ -51,6 +44,12 @@ export interface SessionConnectionParams extends PersistentConnectionParams { // destroy_auth_key#d1435160 = DestroyAuthKeyRes; // const DESTROY_AUTH_KEY = Buffer.from('605134d1', 'hex') +// gzip_packed#3072cfa1 packed_data:string = Object; +const GZIP_PACKED_ID = 0x3072cfa1 +// msg_container#73f1f8dc messages:vector<%Message> = MessageContainer; +const MSG_CONTAINER_ID = 0x73f1f8dc +// rpc_result#f35c6d01 req_msg_id:long result:Object = RpcResult; +const RPC_RESULT_ID = 0xf35c6d01 function makeNiceStack(error: tl.RpcError, stack: string, method?: string) { error.stack = `RpcError (${error.code} ${error.text}): ${error.message}\n at ${method}\n${stack @@ -80,6 +79,7 @@ export class SessionConnection extends PersistentConnection { private _readerMap: TlReaderMap private _writerMap: TlWriterMap + private _crypto: ICryptoProvider constructor( params: SessionConnectionParams, @@ -90,6 +90,7 @@ export class SessionConnection extends PersistentConnection { this._readerMap = params.readerMap this._writerMap = params.writerMap + this._crypto = params.crypto this._handleRawMessage = this._handleRawMessage.bind(this) } @@ -265,7 +266,7 @@ export class SessionConnection extends PersistentConnection { this._session.authorizationPending = true this.emit('auth-begin') - doAuthorization(this, this.params.crypto) + doAuthorization(this, this._crypto) .then(async ([authKey, serverSalt, timeOffset]) => { await this._session._authKey.setup(authKey) this._session.serverSalt = serverSalt @@ -312,7 +313,7 @@ export class SessionConnection extends PersistentConnection { this._isPfsBindingPending = true } - doAuthorization(this, this.params.crypto, TEMP_AUTH_KEY_EXPIRY) + doAuthorization(this, this._crypto, TEMP_AUTH_KEY_EXPIRY) .then(async ([tempAuthKey, tempServerSalt]) => { if (!this._usePfs) { this.log.info('pfs has been disabled while generating temp key') @@ -357,16 +358,11 @@ export class SessionConnection extends PersistentConnection { writer.raw(randomBytes(8)) const msgWithPadding = writer.result() - const hash = await this.params.crypto.sha1(msgWithoutPadding) + const hash = 
await this._crypto.sha1(msgWithoutPadding) const msgKey = hash.subarray(4, 20) - const ige = await createAesIgeForMessageOld( - this.params.crypto, - this._session._authKey.key, - msgKey, - true, - ) - const encryptedData = await ige.encrypt(msgWithPadding) + const ige = await createAesIgeForMessageOld(this._crypto, this._session._authKey.key, msgKey, true) + const encryptedData = ige.encrypt(msgWithPadding) const encryptedMessage = concatBuffers([this._session._authKey.id, msgKey, encryptedData]) const promise = createControllablePromise() @@ -512,22 +508,17 @@ export class SessionConnection extends PersistentConnection { } private _handleRawMessage(messageId: Long, seqNo: number, message: TlBinaryReader): void { - if (message.peekUint() === 0x3072cfa1) { - // gzip_packed - // we can't use message.gzip() because it may contain msg_container, - // so we parse it manually. - message.uint() + const objectId = message.uint() + if (objectId === GZIP_PACKED_ID) { return this._handleRawMessage( messageId, seqNo, - new TlBinaryReader(this._readerMap, gzipInflate(message.bytes())), + new TlBinaryReader(this._readerMap, this._crypto.gunzip(message.bytes())), ) } - if (message.peekUint() === 0x73f1f8dc) { - // msg_container - message.uint() + if (objectId === MSG_CONTAINER_ID) { const count = message.uint() for (let i = 0; i < count; i++) { @@ -545,15 +536,12 @@ export class SessionConnection extends PersistentConnection { return } - if (message.peekUint() === 0xf35c6d01) { - // rpc_result - message.uint() - + if (objectId === RPC_RESULT_ID) { return this._onRpcResult(messageId, message) } // we are safe.. i guess - this._handleMessage(messageId, message.object()) + this._handleMessage(messageId, message.object(objectId)) } private _handleMessage(messageId: Long, message_: unknown): void { @@ -729,7 +717,22 @@ export class SessionConnection extends PersistentConnection { const rpc = msg.rpc const customReader = this._readerMap._results![rpc.method] - const result: any = customReader ? 
customReader(message) : message.object() + + let result: any + + if (customReader) { + result = customReader(message) + } else { + const objectId = message.uint() + + if (objectId === GZIP_PACKED_ID) { + const inner = this._crypto.gunzip(message.bytes()) + // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment + result = TlBinaryReader.deserializeObject(this._readerMap, inner) + } else { + result = message.object(objectId) + } + } // initConnection call was definitely received and // processed by the server, so we no longer need to use it @@ -1262,13 +1265,14 @@ export class SessionConnection extends PersistentConnection { // if it is less than 0.9, then try to compress the whole request const middle = ~~((content.length - 1024) / 2) - const gzipped = gzipDeflate(content.subarray(middle, middle + 1024), 0.9) + const middlePart = content.subarray(middle, middle + 1024) + const gzipped = this._crypto.gzip(middlePart, Math.floor(middlePart.length * 0.9)) if (!gzipped) shouldGzip = false } if (shouldGzip) { - const gzipped = gzipDeflate(content, 0.9) + const gzipped = this._crypto.gzip(content, Math.floor(content.length * 0.9)) if (gzipped) { this.log.debug('gzipped %s (%db -> %db)', method, content.length, gzipped.length) @@ -1601,7 +1605,7 @@ export class SessionConnection extends PersistentConnection { // leave bytes for mtproto header (we'll write it later, // since we need seqno and msg_id to be larger than the content) writer.pos += 16 - writer.uint(0x73f1f8dc) // msg_container + writer.uint(MSG_CONTAINER_ID) writer.uint(messageCount) } diff --git a/packages/core/src/network/transports/obfuscated.ts b/packages/core/src/network/transports/obfuscated.ts index b9631125..b62a2f7a 100644 --- a/packages/core/src/network/transports/obfuscated.ts +++ b/packages/core/src/network/transports/obfuscated.ts @@ -1,5 +1,5 @@ import { concatBuffers, dataViewFromBuffer } from '../../utils/buffer-utils.js' -import { IEncryptionScheme, randomBytes } from '../../utils/index.js' +import { IAesCtr, randomBytes } from '../../utils/index.js' import { IPacketCodec } from './abstract.js' import { WrappedCodec } from './wrapped.js' @@ -11,8 +11,8 @@ export interface MtProxyInfo { } export class ObfuscatedPacketCodec extends WrappedCodec implements IPacketCodec { - private _encryptor?: IEncryptionScheme - private _decryptor?: IEncryptionScheme + private _encryptor?: IAesCtr + private _decryptor?: IAesCtr private _proxy?: MtProxyInfo @@ -78,31 +78,31 @@ export class ObfuscatedPacketCodec extends WrappedCodec implements IPacketCodec decryptKey = await this._crypto.sha256(concatBuffers([decryptKey, this._proxy.secret])) } - this._encryptor = await this._crypto.createAesCtr(encryptKey, encryptIv, true) - this._decryptor = await this._crypto.createAesCtr(decryptKey, decryptIv, false) + this._encryptor = this._crypto.createAesCtr(encryptKey, encryptIv, true) + this._decryptor = this._crypto.createAesCtr(decryptKey, decryptIv, false) - const encrypted = await this._encryptor.encrypt(random) + const encrypted = this._encryptor.process(random) random.set(encrypted.subarray(56, 64), 56) return random } async encode(packet: Uint8Array): Promise { - return this._encryptor!.encrypt(await this._inner.encode(packet)) + return this._encryptor!.process(await this._inner.encode(packet)) } feed(data: Uint8Array): void { - const dec = this._decryptor!.decrypt(data) + const dec = this._decryptor!.process(data) - if (ArrayBuffer.isView(dec)) this._inner.feed(dec) - else { - dec.then((dec) => 
this._inner.feed(dec)).catch((err) => this.emit('error', err)) - } + this._inner.feed(dec) } reset(): void { this._inner.reset() - delete this._encryptor - delete this._decryptor + this._encryptor?.close?.() + this._decryptor?.close?.() + + this._encryptor = undefined + this._decryptor = undefined } } diff --git a/packages/core/src/utils/crypto/abstract.ts b/packages/core/src/utils/crypto/abstract.ts index c3b0fde8..041c074a 100644 --- a/packages/core/src/utils/crypto/abstract.ts +++ b/packages/core/src/utils/crypto/abstract.ts @@ -1,11 +1,14 @@ import { MaybeAsync } from '../../types/index.js' -import { AesModeOfOperationIge } from './common.js' import { factorizePQSync } from './factorization.js' export interface IEncryptionScheme { - encrypt(data: Uint8Array): MaybeAsync + encrypt(data: Uint8Array): Uint8Array + decrypt(data: Uint8Array): Uint8Array +} - decrypt(data: Uint8Array): MaybeAsync +export interface IAesCtr { + process(data: Uint8Array): Uint8Array + close?(): void } export interface ICryptoProvider { @@ -25,32 +28,20 @@ export interface ICryptoProvider { hmacSha256(data: Uint8Array, key: Uint8Array): MaybeAsync - // in telegram, iv is always either used only once, or is the same for all calls for the key - createAesCtr(key: Uint8Array, iv: Uint8Array, encrypt: boolean): MaybeAsync + createAesCtr(key: Uint8Array, iv: Uint8Array, encrypt: boolean): IAesCtr - createAesIge(key: Uint8Array, iv: Uint8Array): MaybeAsync - - createAesEcb(key: Uint8Array): MaybeAsync + createAesIge(key: Uint8Array, iv: Uint8Array): IEncryptionScheme factorizePQ(pq: Uint8Array): MaybeAsync<[Uint8Array, Uint8Array]> + + gzip(data: Uint8Array, maxSize: number): Uint8Array | null + gunzip(data: Uint8Array): Uint8Array } export abstract class BaseCryptoProvider { - createAesIge(key: Uint8Array, iv: Uint8Array): MaybeAsync { - const ecb = this.createAesEcb(key) - - if ('then' in ecb) { - return ecb.then((ecb) => new AesModeOfOperationIge(key, iv, ecb)) - } - - return new AesModeOfOperationIge(key, iv, ecb) - } - factorizePQ(pq: Uint8Array) { return factorizePQSync(pq) } - - abstract createAesEcb(key: Uint8Array): MaybeAsync } export type CryptoProviderFactory = () => ICryptoProvider diff --git a/packages/core/src/utils/crypto/common.ts b/packages/core/src/utils/crypto/common.ts deleted file mode 100644 index b8dfb83f..00000000 --- a/packages/core/src/utils/crypto/common.ts +++ /dev/null @@ -1,67 +0,0 @@ -import type { IEncryptionScheme } from './abstract.js' -import { xorBufferInPlace } from './utils.js' - -/** - * AES mode of operation IGE implementation in JS - */ -export class AesModeOfOperationIge implements IEncryptionScheme { - private _key: Uint8Array - private _iv: Uint8Array - private _aes: IEncryptionScheme - - constructor(key: Uint8Array, iv: Uint8Array, ecb: IEncryptionScheme) { - this._key = key - this._iv = iv - this._aes = ecb - } - - async encrypt(data: Uint8Array): Promise { - if (data.length % 16 !== 0) { - throw new Error('invalid plaintext size (must be multiple of 16 bytes)') - } - - const ciphertext = new Uint8Array(data.length) - let block = new Uint8Array(16) - - let iv1 = this._iv.subarray(0, 16) - let iv2 = this._iv.subarray(16, 32) - - for (let i = 0; i < data.length; i += 16) { - block.set(data.subarray(i, i + 16)) - xorBufferInPlace(block, iv1) - block = await this._aes.encrypt(block) - xorBufferInPlace(block, iv2) - ciphertext.set(block, i) - - iv1 = ciphertext.subarray(i, i + 16) - iv2 = data.subarray(i, i + 16) - } - - return ciphertext - } - - async decrypt(data: 
Uint8Array): Promise { - if (data.length % 16 !== 0) { - throw new Error('invalid ciphertext size (must be multiple of 16 bytes)') - } - - const plaintext = new Uint8Array(data.length) - let block = new Uint8Array(16) - - let iv1 = this._iv.subarray(16, 32) - let iv2 = this._iv.subarray(0, 16) - - for (let i = 0; i < data.length; i += 16) { - block.set(data.subarray(i, i + 16)) - xorBufferInPlace(block, iv1) - block = await this._aes.decrypt(block) - xorBufferInPlace(block, iv2) - plaintext.set(block, i) - - iv1 = plaintext.subarray(i, i + 16) - iv2 = data.subarray(i, i + 16) - } - - return plaintext - } -} diff --git a/packages/core/src/utils/crypto/node-crypto.ts b/packages/core/src/utils/crypto/node-crypto.ts deleted file mode 100644 index df94acf5..00000000 --- a/packages/core/src/utils/crypto/node-crypto.ts +++ /dev/null @@ -1,64 +0,0 @@ -// eslint-disable-next-line no-restricted-imports -import { createCipheriv, createDecipheriv, createHash, createHmac, pbkdf2 } from 'crypto' - -import { MaybeAsync } from '../../types/index.js' -import { concatBuffers } from '../buffer-utils.js' -import { BaseCryptoProvider, ICryptoProvider, IEncryptionScheme } from './abstract.js' - -export class NodeCryptoProvider extends BaseCryptoProvider implements ICryptoProvider { - createAesCtr(key: Uint8Array, iv: Uint8Array, encrypt: boolean): IEncryptionScheme { - const cipher = (encrypt ? createCipheriv : createDecipheriv)(`aes-${key.length * 8}-ctr`, key, iv) - - const update = (data: Uint8Array) => cipher.update(data) - - return { - encrypt: update, - decrypt: update, - } - } - - createAesEcb(key: Uint8Array): IEncryptionScheme { - const methodName = `aes-${key.length * 8}-ecb` - - return { - encrypt(data: Uint8Array) { - const cipher = createCipheriv(methodName, key, null) - cipher.setAutoPadding(false) - - return concatBuffers([cipher.update(data), cipher.final()]) - }, - decrypt(data: Uint8Array) { - const cipher = createDecipheriv(methodName, key, null) - cipher.setAutoPadding(false) - - return concatBuffers([cipher.update(data), cipher.final()]) - }, - } - } - - pbkdf2( - password: Uint8Array, - salt: Uint8Array, - iterations: number, - keylen = 64, - algo = 'sha512', - ): MaybeAsync { - return new Promise((resolve, reject) => - pbkdf2(password, salt, iterations, keylen, algo, (err: Error | null, buf: Uint8Array) => - err !== null ? 
reject(err) : resolve(buf), - ), - ) - } - - sha1(data: Uint8Array): Uint8Array { - return createHash('sha1').update(data).digest() - } - - sha256(data: Uint8Array): Uint8Array { - return createHash('sha256').update(data).digest() - } - - hmacSha256(data: Uint8Array, key: Uint8Array): MaybeAsync { - return createHmac('sha256', key).update(data).digest() - } -} diff --git a/packages/core/src/utils/crypto/node.ts b/packages/core/src/utils/crypto/node.ts new file mode 100644 index 00000000..e5d11fd1 --- /dev/null +++ b/packages/core/src/utils/crypto/node.ts @@ -0,0 +1,93 @@ +// eslint-disable-next-line no-restricted-imports +import { createCipheriv, createHash, createHmac, pbkdf2 } from 'crypto' +import { deflateSync, gunzipSync } from 'zlib' + +import { ige256Decrypt, ige256Encrypt, initAsync, InitInput } from '@mtcute/wasm' + +import { MaybeAsync } from '../../types/index.js' +import { BaseCryptoProvider, IAesCtr, ICryptoProvider, IEncryptionScheme } from './abstract.js' + +export abstract class BaseNodeCryptoProvider extends BaseCryptoProvider { + createAesCtr(key: Uint8Array, iv: Uint8Array): IAesCtr { + const cipher = createCipheriv(`aes-${key.length * 8}-ctr`, key, iv) + + const update = (data: Uint8Array) => cipher.update(data) + + return { + process: update, + } + } + + pbkdf2( + password: Uint8Array, + salt: Uint8Array, + iterations: number, + keylen = 64, + algo = 'sha512', + ): MaybeAsync { + return new Promise((resolve, reject) => + pbkdf2(password, salt, iterations, keylen, algo, (err: Error | null, buf: Uint8Array) => + err !== null ? reject(err) : resolve(buf), + ), + ) + } + + sha1(data: Uint8Array): Uint8Array { + return createHash('sha1').update(data).digest() + } + + sha256(data: Uint8Array): Uint8Array { + return createHash('sha256').update(data).digest() + } + + hmacSha256(data: Uint8Array, key: Uint8Array): Uint8Array { + return createHmac('sha256', key).update(data).digest() + } + + gzip(data: Uint8Array, maxSize: number): Uint8Array | null { + // todo: test if wasm impl is better fit here + try { + // telegram accepts both zlib and gzip, but zlib is faster and has less overhead, so we use it here + return deflateSync(data, { + maxOutputLength: maxSize, + }) + // hot path, avoid additional runtime checks + // eslint-disable-next-line @typescript-eslint/no-explicit-any + } catch (e: any) { + if (e.code === 'ERR_BUFFER_TOO_LARGE') { + return null + } + + throw e + } + } + + gunzip(data: Uint8Array): Uint8Array { + // todo: test if wasm impl is better fit here + return gunzipSync(data) + } +} + +export class NodeCryptoProvider extends BaseNodeCryptoProvider implements ICryptoProvider { + private wasmInput?: InitInput + + constructor(params?: { wasmInput?: InitInput }) { + super() + this.wasmInput = params?.wasmInput + } + + initialize(): Promise { + return initAsync(this.wasmInput) + } + + createAesIge(key: Uint8Array, iv: Uint8Array): IEncryptionScheme { + return { + encrypt(data: Uint8Array): Uint8Array { + return ige256Encrypt(data, key, iv) + }, + decrypt(data: Uint8Array): Uint8Array { + return ige256Decrypt(data, key, iv) + }, + } + } +} diff --git a/packages/core/src/utils/crypto/subtle.ts b/packages/core/src/utils/crypto/web.ts similarity index 54% rename from packages/core/src/utils/crypto/subtle.ts rename to packages/core/src/utils/crypto/web.ts index 498f3553..89f1050f 100644 --- a/packages/core/src/utils/crypto/subtle.ts +++ b/packages/core/src/utils/crypto/web.ts @@ -1,12 +1,17 @@ +import { + createCtr256, + ctr256, + deflateMaxSize, + freeCtr256, + gunzip, + 
ige256Decrypt, + ige256Encrypt, + initAsync, + InitInput, +} from '@mtcute/wasm' + import { MaybeAsync } from '../../index.js' -import { BaseCryptoProvider, ICryptoProvider, IEncryptionScheme } from './abstract.js' - -import AES_, { CTR } from '@cryptography/aes' - -// fucking weird flex with es modules. -// i hate default imports please for the love of god never use them -type AES_ = typeof AES_.default -const AES = 'default' in AES_ ? AES_.default : AES_ as AES_ +import { BaseCryptoProvider, IAesCtr, ICryptoProvider, IEncryptionScheme } from './abstract.js' const ALGO_TO_SUBTLE: Record = { sha256: 'SHA-256', @@ -14,23 +19,23 @@ const ALGO_TO_SUBTLE: Record = { sha512: 'SHA-512', } -function wordsToBytes(words: Uint32Array): Uint8Array { - const o = new Uint8Array(words.byteLength) +export class WebCryptoProvider extends BaseCryptoProvider implements ICryptoProvider { + readonly subtle: SubtleCrypto + readonly wasmInput?: InitInput - const len = words.length * 4 + constructor(params?: { wasmInput?: InitInput; subtle?: SubtleCrypto }) { + super() + this.wasmInput = params?.wasmInput + const subtle = params?.subtle ?? globalThis.crypto?.subtle - for (let i = 0; i < len; ++i) { - o[i] = ((words[i >>> 2] >>> (24 - (i % 4) * 8)) & 0xff) + if (!subtle) { + throw new Error('SubtleCrypto is not available') + } + this.subtle = subtle } - return o -} - -export class SubtleCryptoProvider extends BaseCryptoProvider implements ICryptoProvider { - constructor( - readonly subtle: SubtleCrypto, - ) { - super() + initialize(): Promise { + return initAsync(this.wasmInput) } sha1(data: Uint8Array): MaybeAsync { @@ -78,21 +83,27 @@ export class SubtleCryptoProvider extends BaseCryptoProvider implements ICryptoP return new Uint8Array(res) } - createAesCtr(key: Uint8Array, iv: Uint8Array): IEncryptionScheme { - const aes = new CTR(key, iv) + createAesCtr(key: Uint8Array, iv: Uint8Array): IAesCtr { + const ctx = createCtr256(key, iv) return { - encrypt: (data) => wordsToBytes(aes.encrypt(data)), - decrypt: (data) => wordsToBytes(aes.decrypt(data)), + process: (data) => ctr256(ctx, data), + close: () => freeCtr256(ctx), } } - createAesEcb(key: Uint8Array): IEncryptionScheme { - const aes = new AES(key) - + createAesIge(key: Uint8Array, iv: Uint8Array): IEncryptionScheme { return { - encrypt: (data) => wordsToBytes(aes.encrypt(data)), - decrypt: (data) => wordsToBytes(aes.decrypt(data)), + encrypt: (data) => ige256Encrypt(data, key, iv), + decrypt: (data) => ige256Decrypt(data, key, iv), } } + + gzip(data: Uint8Array, maxSize: number): Uint8Array | null { + return deflateMaxSize(data, maxSize) + } + + gunzip(data: Uint8Array): Uint8Array { + return gunzip(data) + } } diff --git a/packages/core/src/utils/platform/crypto.ts b/packages/core/src/utils/platform/crypto.ts index e2b8c3d6..52abfb45 100644 --- a/packages/core/src/utils/platform/crypto.ts +++ b/packages/core/src/utils/platform/crypto.ts @@ -1,4 +1,4 @@ -import { NodeCryptoProvider } from '../crypto/node-crypto.js' +import { NodeCryptoProvider } from '../crypto/node.js' /** @internal */ export const _defaultCryptoProviderFactory = () => new NodeCryptoProvider() diff --git a/packages/core/src/utils/platform/crypto.web.ts b/packages/core/src/utils/platform/crypto.web.ts index 5e792bbf..2f27c213 100644 --- a/packages/core/src/utils/platform/crypto.web.ts +++ b/packages/core/src/utils/platform/crypto.web.ts @@ -1,5 +1,5 @@ import { MtUnsupportedError } from '../../index.js' -import { SubtleCryptoProvider } from '../crypto/subtle.js' +import { WebCryptoProvider 
} from '../crypto/web.js' /** @internal */ export const _defaultCryptoProviderFactory = () => { @@ -7,5 +7,5 @@ export const _defaultCryptoProviderFactory = () => { throw new MtUnsupportedError('WebCrypto API is not available') } - return new SubtleCryptoProvider(crypto.subtle) + return new WebCryptoProvider({ subtle: crypto.subtle }) } diff --git a/packages/core/tests/auth-key.spec.ts b/packages/core/tests/auth-key.spec.ts index 880ca391..eccc8753 100644 --- a/packages/core/tests/auth-key.spec.ts +++ b/packages/core/tests/auth-key.spec.ts @@ -6,7 +6,7 @@ import { describe, it } from 'mocha' import { TlReaderMap } from '@mtcute/tl-runtime' import { AuthKey } from '../src/network/auth-key.js' -import { NodeCryptoProvider } from '../src/utils/crypto/node-crypto.js' +import { NodeCryptoProvider } from '../src/utils/crypto/node.js' import { LogManager } from '../src/utils/index.js' chai.use(spies) diff --git a/packages/core/tests/crypto-providers.spec.ts b/packages/core/tests/crypto-providers.spec.ts index 2d0a410a..ea89492d 100644 --- a/packages/core/tests/crypto-providers.spec.ts +++ b/packages/core/tests/crypto-providers.spec.ts @@ -4,11 +4,13 @@ import { describe, it } from 'mocha' import { hexDecodeToBuffer, hexEncode, utf8EncodeToBuffer } from '@mtcute/tl-runtime' -import { NodeCryptoProvider } from '../src/utils/crypto/node-crypto.js' -import { SubtleCryptoProvider } from '../src/utils/crypto/subtle.js' +import { NodeCryptoProvider } from '../src/utils/crypto/node.js' +import { WebCryptoProvider } from '../src/utils/crypto/web.js' import { ICryptoProvider } from '../src/utils/index.js' export function testCryptoProvider(c: ICryptoProvider): void { + before(() => c.initialize?.()) + it('should calculate sha1', async () => { expect(hexEncode(await c.sha1(utf8EncodeToBuffer('')))).to.eq('da39a3ee5e6b4b0d3255bfef95601890afd80709') expect(hexEncode(await c.sha1(utf8EncodeToBuffer('hello')))).to.eq('aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d') @@ -47,81 +49,51 @@ export function testCryptoProvider(c: ICryptoProvider): void { ) }) - it('should encrypt and decrypt aes-ctr', async () => { - let aes = await c.createAesCtr( + it('should encrypt and decrypt aes-ctr', () => { + let aes = c.createAesCtr( hexDecodeToBuffer('d450aae0bf0060a4af1044886b42a13f7c506b35255d134a7e87ab3f23a9493b'), hexDecodeToBuffer('0182de2bd789c295c3c6c875c5e9e190'), true, ) const data = hexDecodeToBuffer('7baae571e4c2f4cfadb1931d5923aca7') - expect(hexEncode(await aes.encrypt(data))).eq('df5647dbb70bc393f2fb05b72f42286f') - expect(hexEncode(await aes.encrypt(data))).eq('3917147082672516b3177150129bc579') - expect(hexEncode(await aes.encrypt(data))).eq('2a7a9089270a5de45d5e3dd399cac725') - expect(hexEncode(await aes.encrypt(data))).eq('56d085217771398ac13583de4d677dd8') - expect(hexEncode(await aes.encrypt(data))).eq('cc639b488126cf36e79c4515e8012b92') - expect(hexEncode(await aes.encrypt(data))).eq('01384d100646cd562cc5586ec3f8f8c4') + expect(hexEncode(aes.process(data))).eq('df5647dbb70bc393f2fb05b72f42286f') + expect(hexEncode(aes.process(data))).eq('3917147082672516b3177150129bc579') + expect(hexEncode(aes.process(data))).eq('2a7a9089270a5de45d5e3dd399cac725') + expect(hexEncode(aes.process(data))).eq('56d085217771398ac13583de4d677dd8') + expect(hexEncode(aes.process(data))).eq('cc639b488126cf36e79c4515e8012b92') + expect(hexEncode(aes.process(data))).eq('01384d100646cd562cc5586ec3f8f8c4') - aes = await c.createAesCtr( + aes.close?.() + aes = c.createAesCtr( 
hexDecodeToBuffer('d450aae0bf0060a4af1044886b42a13f7c506b35255d134a7e87ab3f23a9493b'), hexDecodeToBuffer('0182de2bd789c295c3c6c875c5e9e190'), false, ) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('df5647dbb70bc393f2fb05b72f42286f')))).eq(hexEncode(data)) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('3917147082672516b3177150129bc579')))).eq(hexEncode(data)) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('2a7a9089270a5de45d5e3dd399cac725')))).eq(hexEncode(data)) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('56d085217771398ac13583de4d677dd8')))).eq(hexEncode(data)) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('cc639b488126cf36e79c4515e8012b92')))).eq(hexEncode(data)) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('01384d100646cd562cc5586ec3f8f8c4')))).eq(hexEncode(data)) + expect(hexEncode(aes.process(hexDecodeToBuffer('df5647dbb70bc393f2fb05b72f42286f')))).eq(hexEncode(data)) + expect(hexEncode(aes.process(hexDecodeToBuffer('3917147082672516b3177150129bc579')))).eq(hexEncode(data)) + expect(hexEncode(aes.process(hexDecodeToBuffer('2a7a9089270a5de45d5e3dd399cac725')))).eq(hexEncode(data)) + expect(hexEncode(aes.process(hexDecodeToBuffer('56d085217771398ac13583de4d677dd8')))).eq(hexEncode(data)) + expect(hexEncode(aes.process(hexDecodeToBuffer('cc639b488126cf36e79c4515e8012b92')))).eq(hexEncode(data)) + expect(hexEncode(aes.process(hexDecodeToBuffer('01384d100646cd562cc5586ec3f8f8c4')))).eq(hexEncode(data)) + + aes.close?.() }) - it('should encrypt and decrypt aes-ecb', async () => { - let aes = await c.createAesEcb( - hexDecodeToBuffer('d450aae0bf0060a4af1044886b42a13f7c506b35255d134a7e87ab3f23a9493b'), - ) - - expect(hexEncode(await aes.encrypt(hexDecodeToBuffer('f71eed6018f1ef976d39c19f9d29fd29')))).eq( - '038ef30acb438b64159f484aec541fd2', - ) - expect(hexEncode(await aes.encrypt(hexDecodeToBuffer('f71eed6018f1ef976d39c19f9d29fd29')))).eq( - '038ef30acb438b64159f484aec541fd2', - ) - expect(hexEncode(await aes.encrypt(hexDecodeToBuffer('460af382084b7960d2e9f3bca4cdc25b')))).eq( - '29c3af710c3c56f7fbb97ca06af3b974', - ) - - aes = await c.createAesEcb( - hexDecodeToBuffer('d450aae0bf0060a4af1044886b42a13f7c506b35255d134a7e87ab3f23a9493b'), - ) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('038ef30acb438b64159f484aec541fd2')))).eq( - 'f71eed6018f1ef976d39c19f9d29fd29', - ) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('038ef30acb438b64159f484aec541fd2')))).eq( - 'f71eed6018f1ef976d39c19f9d29fd29', - ) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('29c3af710c3c56f7fbb97ca06af3b974')))).eq( - '460af382084b7960d2e9f3bca4cdc25b', - ) - }) - - it('should encrypt and decrypt aes-ige', async () => { - const aes = await c.createAesIge( + it('should encrypt and decrypt aes-ige', () => { + const aes = c.createAesIge( hexDecodeToBuffer('5468697320697320616E20696D706C655468697320697320616E20696D706C65'), hexDecodeToBuffer('6D656E746174696F6E206F6620494745206D6F646520666F72204F70656E5353'), ) expect( hexEncode( - await aes.encrypt( - hexDecodeToBuffer('99706487a1cde613bc6de0b6f24b1c7aa448c8b9c3403e3467a8cad89340f53b'), - ), + aes.encrypt(hexDecodeToBuffer('99706487a1cde613bc6de0b6f24b1c7aa448c8b9c3403e3467a8cad89340f53b')), ), ).to.eq('792ea8ae577b1a66cb3bd92679b8030ca54ee631976bd3a04547fdcb4639fa69') expect( hexEncode( - await aes.decrypt( - hexDecodeToBuffer('792ea8ae577b1a66cb3bd92679b8030ca54ee631976bd3a04547fdcb4639fa69'), - ), + 
aes.decrypt(hexDecodeToBuffer('792ea8ae577b1a66cb3bd92679b8030ca54ee631976bd3a04547fdcb4639fa69')), ), ).to.eq('99706487a1cde613bc6de0b6f24b1c7aa448c8b9c3403e3467a8cad89340f53b') }) @@ -137,12 +109,12 @@ describe('NodeCryptoProvider', () => { testCryptoProvider(new NodeCryptoProvider()) }) -describe('SubtleCryptoProvider', () => { +describe('WebCryptoProvider', () => { if (typeof crypto.subtle === 'undefined') { - console.warn('Skipping SubtleCryptoProvider tests') + console.warn('Skipping WebCryptoProvider tests') return } - testCryptoProvider(new SubtleCryptoProvider(crypto.subtle)) + testCryptoProvider(new WebCryptoProvider({ subtle: crypto.subtle })) }) diff --git a/packages/core/tests/keys.spec.ts b/packages/core/tests/keys.spec.ts index a85415cc..ae912513 100644 --- a/packages/core/tests/keys.spec.ts +++ b/packages/core/tests/keys.spec.ts @@ -1,7 +1,7 @@ import { expect } from 'chai' import { describe, it } from 'mocha' -import { NodeCryptoProvider } from '../src/utils/crypto/node-crypto.js' +import { NodeCryptoProvider } from '../src/utils/crypto/node.js' import { parsePublicKey } from '../src/utils/index.js' const crypto = new NodeCryptoProvider() diff --git a/packages/core/tests/mtproto-crypto.spec.ts b/packages/core/tests/mtproto-crypto.spec.ts index c21340f2..1ded2b61 100644 --- a/packages/core/tests/mtproto-crypto.spec.ts +++ b/packages/core/tests/mtproto-crypto.spec.ts @@ -9,7 +9,7 @@ import { createAesIgeForMessageOld, generateKeyAndIvFromNonce, } from '../src/utils/crypto/mtproto.js' -import { NodeCryptoProvider } from '../src/utils/crypto/node-crypto.js' +import { NodeCryptoProvider } from '../src/utils/crypto/node.js' chai.use(spies) diff --git a/packages/crypto-node/.gitignore b/packages/crypto-node/.gitignore index 1a53b9f0..a17106b9 100644 --- a/packages/crypto-node/.gitignore +++ b/packages/crypto-node/.gitignore @@ -1,2 +1,2 @@ .vs -build +build \ No newline at end of file diff --git a/packages/crypto-node/src/index.ts b/packages/crypto-node/src/index.ts index 21521222..53c39f41 100644 --- a/packages/crypto-node/src/index.ts +++ b/packages/crypto-node/src/index.ts @@ -1,4 +1,4 @@ -import { NodeCryptoProvider } from '@mtcute/core/src/utils/crypto/node-crypto.js' +import { BaseNodeCryptoProvider } from '@mtcute/core/src/utils/crypto/node.js' import { IEncryptionScheme } from '@mtcute/core/utils.js' import { native } from './native.cjs' @@ -13,7 +13,7 @@ const { ige256_decrypt, ige256_encrypt } = native * Other modes are supported natively by OpenSSL, and * they *are* faster than the custom ones. */ -export class NodeNativeCryptoProvider extends NodeCryptoProvider { +export class NodeNativeCryptoProvider extends BaseNodeCryptoProvider { createAesIge(key: Uint8Array, iv: Uint8Array): IEncryptionScheme { return { encrypt(data: Uint8Array): Uint8Array { diff --git a/packages/crypto/README.md b/packages/crypto/README.md deleted file mode 100644 index c2d2a6e9..00000000 --- a/packages/crypto/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# This is not a package -This is just a bunch of C files that are imported in `crypto-*` packages. 
- -## Acknowledgements -This code is based on [pyrogram/tgcrypto](https://github.com/pyrogram/tgcrypto) \ No newline at end of file diff --git a/packages/crypto/cbc256.c b/packages/crypto/cbc256.c deleted file mode 100644 index 0077c73d..00000000 --- a/packages/crypto/cbc256.c +++ /dev/null @@ -1,36 +0,0 @@ -#include "aes256.h" - -void cbc256_encrypt(uint8_t* in, size_t length, uint8_t* key, uint8_t* iv, uint8_t* out) { - uint32_t expandedKey[EXPANDED_KEY_SIZE]; - uint32_t i, j; - - uint8_t* currentIv = iv; - - aes256_set_encryption_key(key, expandedKey); - - for (i = 0; i < length; i += AES_BLOCK_SIZE) { - for (j = 0; j < AES_BLOCK_SIZE; ++j) - out[i + j] = in[i + j] ^ currentIv[j]; - - aes256_encrypt(&out[i], &out[i], expandedKey); - currentIv = &out[i]; - } -} - -void cbc256_decrypt(uint8_t* in, size_t length, uint8_t* key, uint8_t* iv, uint8_t* out) { - uint32_t expandedKey[EXPANDED_KEY_SIZE]; - uint32_t i, j; - - uint8_t* currentIv = iv; - - aes256_set_decryption_key(key, expandedKey); - - for (i = 0; i < length; i += AES_BLOCK_SIZE) { - aes256_decrypt(&in[i], &out[i], expandedKey); - - for (j = 0; j < AES_BLOCK_SIZE; ++j) - out[i + j] ^= currentIv[j]; - - currentIv = &in[i]; - } -} \ No newline at end of file diff --git a/packages/crypto/cbc256.h b/packages/crypto/cbc256.h deleted file mode 100644 index b49179fa..00000000 --- a/packages/crypto/cbc256.h +++ /dev/null @@ -1,17 +0,0 @@ -#include - -#ifndef CBC256_H -#define CBC256_H - -#ifdef __cplusplus -extern "C" { -#endif - -void cbc256_encrypt(uint8_t* in, size_t length, uint8_t* key, uint8_t* iv, uint8_t* out); -void cbc256_decrypt(uint8_t* in, size_t length, uint8_t* key, uint8_t* iv, uint8_t* out); - -#ifdef __cplusplus -} -#endif - -#endif // CBC256_H diff --git a/packages/crypto/ctr256.c b/packages/crypto/ctr256.c deleted file mode 100644 index bca6a194..00000000 --- a/packages/crypto/ctr256.c +++ /dev/null @@ -1,32 +0,0 @@ -#include "aes256.h" - -#define MIN(a, b) (((a) < (b)) ? 
(a) : (b)) - -void ctr256(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uint8_t* counter, uint8_t* out) { - uint8_t chunk[AES_BLOCK_SIZE]; - uint32_t expandedKey[EXPANDED_KEY_SIZE]; - uint32_t i, j, k; - - memcpy(out, in, length); - aes256_set_encryption_key(key, expandedKey); - - aes256_encrypt(iv, chunk, expandedKey); - - for (i = 0; i < length; i += AES_BLOCK_SIZE) { - for (j = 0; j < MIN(length - i, AES_BLOCK_SIZE); ++j) { - out[i + j] ^= chunk[(*counter)++]; - - if (*counter >= AES_BLOCK_SIZE) - *counter = 0; - - if (*counter == 0) { - k = AES_BLOCK_SIZE; - while(k--) - if (++iv[k]) - break; - - aes256_encrypt(iv, chunk, expandedKey); - } - } - } -} diff --git a/packages/crypto/ctr256.h b/packages/crypto/ctr256.h deleted file mode 100644 index c74065f7..00000000 --- a/packages/crypto/ctr256.h +++ /dev/null @@ -1,8 +0,0 @@ -#include - -#ifndef CTR256_H -#define CTR256_H - -extern "C" uint8_t* ctr256(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uint8_t* state, uint8_t* out); - -#endif // CTR256_H diff --git a/packages/tl-runtime/package.json b/packages/tl-runtime/package.json index f12a6802..b1f09465 100644 --- a/packages/tl-runtime/package.json +++ b/packages/tl-runtime/package.json @@ -17,9 +17,7 @@ "./cjs/encodings/hex.js": "./cjs/encodings/hex.web.js", "./esm/encodings/hex.js": "./esm/encodings/hex.web.js", "./cjs/encodings/utf8.js": "./cjs/encodings/utf8.web.js", - "./esm/encodings/utf8.js": "./esm/encodings/utf8.web.js", - "./cjs/encodings/gzip.js": "./cjs/encodings/gzip.web.js", - "./esm/encodings/gzip.js": "./esm/encodings/gzip.web.js" + "./esm/encodings/utf8.js": "./esm/encodings/utf8.web.js" }, "distOnlyFields": { "exports": { @@ -31,10 +29,6 @@ }, "main": "src/index.ts", "dependencies": { - "long": "5.2.3", - "pako": "2.1.0" - }, - "devDependencies": { - "@types/pako": "2.0.0" + "long": "5.2.3" } } diff --git a/packages/tl-runtime/src/encodings/gzip.ts b/packages/tl-runtime/src/encodings/gzip.ts deleted file mode 100644 index f865052b..00000000 --- a/packages/tl-runtime/src/encodings/gzip.ts +++ /dev/null @@ -1,39 +0,0 @@ -/* eslint-disable no-restricted-globals */ - -import { deflateSync, gunzipSync } from 'node:zlib' - -/** - * Decompress a buffer with gzip. - * @param buf Buffer to decompress - */ -export function gzipInflate(buf: Uint8Array): Uint8Array { - return gunzipSync(buf) -} - -/** - * Compress a buffer with gzip. - * - * @param buf Buffer to compress - * @param maxRatio - * Maximum compression ratio. If the resulting buffer is smaller than - * `buf.length * ratio`, `null` is returned. 
- */ -export function gzipDeflate(buf: ArrayBuffer, maxRatio?: number): Buffer | null { - if (maxRatio) { - try { - return deflateSync(buf, { - maxOutputLength: Math.floor(buf.byteLength * maxRatio), - }) - // hot path, avoid additional runtime checks - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } catch (e: any) { - if (e.code === 'ERR_BUFFER_TOO_LARGE') { - return null - } - - throw e - } - } - - return deflateSync(buf) -} diff --git a/packages/tl-runtime/src/encodings/gzip.web.ts b/packages/tl-runtime/src/encodings/gzip.web.ts deleted file mode 100644 index 797f0d0f..00000000 --- a/packages/tl-runtime/src/encodings/gzip.web.ts +++ /dev/null @@ -1,40 +0,0 @@ -import { Data, Deflate, inflate } from 'pako' - -export function gzipInflate(buf: Uint8Array): Uint8Array { - return inflate(buf) -} - -const ERROR_SIZE_LIMIT_REACHED = 'ERR_SIZE_LIMIT_REACHED' - -class DeflateLimited extends Deflate { - constructor(readonly limit: number) { - super() - } - - _size = 0 - - onData(chunk: Data) { - this._size += (chunk as Uint8Array).length - - if (this._size > this.limit) { - // caught locally - // eslint-disable-next-line @typescript-eslint/no-throw-literal - throw ERROR_SIZE_LIMIT_REACHED - } - - super.onData(chunk) - } -} - -export function gzipDeflate(buf: Uint8Array, maxRatio?: number): Uint8Array | null { - const deflator = maxRatio ? new DeflateLimited(Math.floor(buf.length * maxRatio)) : new Deflate() - - try { - deflator.push(buf, true) - } catch (e) { - if (e === ERROR_SIZE_LIMIT_REACHED) return null - throw e - } - - return deflator.result -} diff --git a/packages/tl-runtime/src/encodings/index.ts b/packages/tl-runtime/src/encodings/index.ts index 3c14a235..ce22da50 100644 --- a/packages/tl-runtime/src/encodings/index.ts +++ b/packages/tl-runtime/src/encodings/index.ts @@ -1,4 +1,3 @@ export * from './base64.js' -export * from './gzip.js' export * from './hex.js' export * from './utf8.js' diff --git a/packages/tl-runtime/src/index.ts b/packages/tl-runtime/src/index.ts index b112fec0..def86778 100644 --- a/packages/tl-runtime/src/index.ts +++ b/packages/tl-runtime/src/index.ts @@ -1,5 +1,4 @@ export * from './encodings/base64.js' -export * from './encodings/gzip.js' export * from './encodings/hex.js' export * from './encodings/utf8.js' export * from './reader.js' diff --git a/packages/tl-runtime/src/reader.ts b/packages/tl-runtime/src/reader.ts index a2f2ca0e..faa247c8 100644 --- a/packages/tl-runtime/src/reader.ts +++ b/packages/tl-runtime/src/reader.ts @@ -1,6 +1,5 @@ import Long from 'long' -import { gzipInflate } from './encodings/gzip.js' import { hexEncode } from './encodings/hex.js' import { utf8Decode } from './encodings/utf8.js' @@ -178,13 +177,10 @@ export class TlBinaryReader { return utf8Decode(this.bytes()) } - object(): unknown { - const id = this.uint() - + object(id = this.uint()): unknown { if (id === 0x1cb5c415 /* vector */) { return this.vector(this.object, true) } - if (id === 0x3072cfa1 /* gzip_packed */) return this.gzip() if (id === 0xbc799737 /* boolFalse */) return false if (id === 0x997275b5 /* boolTrue */) return true // unsure if it is actually used in the wire, seems like it's only used for boolean flags @@ -209,10 +205,6 @@ export class TlBinaryReader { return reader(this) } - gzip(): unknown { - return new TlBinaryReader(this.objectsMap, gzipInflate(this.bytes())).object() - } - vector(reader = this.object, bare = false): unknown[] { if (!bare) { const uint = this.uint() diff --git a/packages/tl/scripts/gen-rsa-keys.ts 
b/packages/tl/scripts/gen-rsa-keys.ts
index 05dbab3b..e5b6cd85 100644
--- a/packages/tl/scripts/gen-rsa-keys.ts
+++ b/packages/tl/scripts/gen-rsa-keys.ts
@@ -3,7 +3,7 @@ import { writeFile } from 'fs/promises'
 import { join } from 'path'
 import readline from 'readline'
 
-import { NodeCryptoProvider } from '@mtcute/core/src/utils/crypto/node-crypto.js'
+import { NodeCryptoProvider } from '@mtcute/core/src/utils/crypto/node.js'
 import { parsePublicKey } from '@mtcute/core/utils.js'
 
 import { TlPublicKey } from '../binary/rsa-keys.js'
diff --git a/packages/wasm/.gitignore
new file mode 100644
index 00000000..c795b054
--- /dev/null
+++ b/packages/wasm/.gitignore
@@ -0,0 +1 @@
+build
\ No newline at end of file
diff --git a/packages/wasm/README.md
new file mode 100644
index 00000000..ea528ffb
--- /dev/null
+++ b/packages/wasm/README.md
@@ -0,0 +1,19 @@
+# @mtcute/wasm
+
+📖 [API Reference](https://ref.mtcute.dev/modules/_mtcute_wasm.html)
+
+Highly optimized for size & speed WASM implementation of common algorithms used in Telegram.
+
+## Features
+- **Super lightweight**: Only 45 KB raw, 22 KB gzipped
+- **Blazingly fast**: Up to 10x faster than pure JS implementations
+- Implements AES IGE and Deflate (zlib compression + gunzip), which are not available in some environments (e.g. web)
+
+## Acknowledgements
+- Deflate is implemented through a modified version of [libdeflate](https://github.com/ebiggers/libdeflate), MIT license.
+  - Modified by [kamillaova](https://github.com/kamillaova) to support WASM and improve bundle size
+- AES IGE code is mostly based on [tgcrypto](https://github.com/pyrogram/tgcrypto), LGPL-3.0 license.
+  - To comply with LGPL-3.0, the source code of the modified tgcrypto is available [here](./lib/crypto/) under LGPL-3.0 license.
+
+## Benchmarks
+See https://github.com/mtcute/benchmarks
\ No newline at end of file
diff --git a/packages/wasm/build.config.cjs
new file mode 100644
index 00000000..a8c383ce
--- /dev/null
+++ b/packages/wasm/build.config.cjs
@@ -0,0 +1,20 @@
+// /* eslint-disable no-console */
+// import * cp from 'child_process'
+// import * as fs from 'fs'
+// import { join } from 'path'
+
+// const root = new URL('.', import.meta.url).pathname
+
+module.exports = ({ path: { join }, fs, outDir, packageDir, transformFile }) => ({
+    esmOnlyDirectives: true,
+    final() {
+        const fixWasmPath = (path) => {
+            transformFile(join(outDir, path), (data) => data.replace('../lib/mtcute.wasm', '../mtcute.wasm'))
+        }
+
+        fixWasmPath('cjs/init.js')
+        fixWasmPath('esm/init.js')
+
+        fs.cpSync(join(packageDir, 'lib/mtcute.wasm'), join(outDir, 'mtcute.wasm'))
+    },
+})
diff --git a/packages/wasm/lib/Dockerfile
new file mode 100644
index 00000000..6ccbe96e
--- /dev/null
+++ b/packages/wasm/lib/Dockerfile
@@ -0,0 +1,14 @@
+FROM alpine:3.18.4 AS build
+
+WORKDIR /src
+
+RUN apk add --no-cache lld make clang16 binaryen
+
+COPY crypto /src/crypto
+COPY libdeflate /src/libdeflate
+COPY *.h *.c Makefile /src/
+
+RUN ZLIB_COMPRESSION_API=1 GZIP_DECOMPRESSION_API=1 IGE_API=1 CTR_API=1 make
+
+FROM scratch AS binaries
+COPY --from=build /src/mtcute.wasm /
diff --git a/packages/wasm/lib/Makefile
new file mode 100644
index 00000000..1a403a01
--- /dev/null
+++ b/packages/wasm/lib/Makefile
@@ -0,0 +1,75 @@
+.PHONY: all clean
+
+DEFAULT_API ?= 0
+
+DEFLATE_COMPRESSION_API ?= $(DEFAULT_API)
+DEFLATE_DECOMPRESSION_API ?= $(DEFAULT_API)
+GZIP_COMPRESSION_API ?= $(DEFAULT_API)
+GZIP_DECOMPRESSION_API ?= $(DEFAULT_API)
+ZLIB_COMPRESSION_API ?= $(DEFAULT_API)
+ZLIB_DECOMPRESSION_API ?= $(DEFAULT_API)
+CRC32_API ?= $(DEFAULT_API)
+ADLER32_API ?= $(DEFAULT_API)
+IGE_API ?= $(DEFAULT_API)
+CTR_API ?= $(DEFAULT_API)
+
+CRC32 ?= 0
+
+LOGGING ?= 0
+
+_DEFLATE_COMPRESSION := 1
+_DEFLATE_DECOMPRESSION := 1
+_ADLER32 := $(findstring 1, $(ZLIB_COMPRESSION_API)$(ZLIB_DECOMPRESSION_API))
+_AES := $(findstring 1, $(IGE_API)$(CTR_API))
+
+SOURCES = utils.c \
+	$(if $(filter 1, $(_DEFLATE_COMPRESSION)), libdeflate/deflate_compress.c) \
+	$(if $(filter 1, $(_DEFLATE_DECOMPRESSION)), libdeflate/deflate_decompress.c) \
+	$(if $(filter 1, $(GZIP_COMPRESSION_API)), libdeflate/gzip_compress.c) \
+	$(if $(filter 1, $(GZIP_DECOMPRESSION_API)), libdeflate/gzip_decompress.c) \
+	$(if $(filter 1, $(ZLIB_COMPRESSION_API)), libdeflate/zlib_compress.c) \
+	$(if $(filter 1, $(ZLIB_DECOMPRESSION_API)), libdeflate/zlib_decompress.c) \
+	$(if $(filter 1, $(CRC32)), libdeflate/crc32.c) \
+	$(if $(filter 1, $(_ADLER32)), libdeflate/adler32.c) \
+	$(if $(filter 1, $(_AES)), crypto/aes256.c) \
+	$(if $(filter 1, $(IGE_API)), crypto/ige256.c) \
+	$(if $(filter 1, $(CTR_API)), crypto/ctr256.c)
+
+CC := clang
+CFLAGS_WASM := \
+	-target wasm32-unknown-unknown \
+	-nostdlib -ffreestanding -DFREESTANDING \
+	$(if $(filter 1, $(LOGGING)), -DLOGGING) \
+	-mbulk-memory \
+	-Wl,--no-entry,--export-dynamic,--lto-O3
+
+CFLAGS := $(CFLAGS_WASM) \
+	-O3 \
+	-Qn \
+	-DNDEBUG \
+	-mno-exception-handling \
+	-fdelete-null-pointer-checks \
+	-fno-stack-protector \
+	-flto=full \
+	-fdata-sections \
+	-ffunction-sections \
+	-Wl,--gc-sections \
+	-fno-inline \
+	-fno-unroll-loops
+
+ifneq ($(OS),Windows_NT)
+	UNAME_S := $(shell uname -s)
+	ifeq ($(UNAME_S),Darwin)
+		export PATH := 
/opt/homebrew/opt/llvm/bin/:$(PATH) + endif +endif + +OUT := mtcute.wasm + +$(OUT): $(SOURCES) + $(CC) $(CFLAGS) -I . -o $@ $^ + +clean: + rm -f $(OUT) + +all: $(OUT) diff --git a/packages/wasm/lib/common_defs.h b/packages/wasm/lib/common_defs.h new file mode 100644 index 00000000..ce3eaf74 --- /dev/null +++ b/packages/wasm/lib/common_defs.h @@ -0,0 +1,686 @@ +/* + * common_defs.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef COMMON_DEFS_H +#define COMMON_DEFS_H + +#include "libdeflate.h" + +#include +#include /* for size_t */ +#include +#ifdef _MSC_VER +# include /* for _BitScan*() and other intrinsics */ +# include /* for _byteswap_*() */ + /* Disable MSVC warnings that are expected. */ + /* /W2 */ +# pragma warning(disable : 4146) /* unary minus on unsigned type */ + /* /W3 */ +# pragma warning(disable : 4018) /* signed/unsigned mismatch */ +# pragma warning(disable : 4244) /* possible loss of data */ +# pragma warning(disable : 4267) /* possible loss of precision */ +# pragma warning(disable : 4310) /* cast truncates constant value */ + /* /W4 */ +# pragma warning(disable : 4100) /* unreferenced formal parameter */ +# pragma warning(disable : 4127) /* conditional expression is constant */ +# pragma warning(disable : 4189) /* local variable initialized but not referenced */ +# pragma warning(disable : 4232) /* nonstandard extension used */ +# pragma warning(disable : 4245) /* conversion from 'int' to 'unsigned int' */ +# pragma warning(disable : 4295) /* array too small to include terminating null */ +#endif + +/* ========================================================================== */ +/* Target architecture */ +/* ========================================================================== */ + +/* If possible, define a compiler-independent ARCH_* macro. 
*/ +#undef ARCH_X86_64 +#undef ARCH_X86_32 +#undef ARCH_ARM64 +#undef ARCH_ARM32 +#ifdef _MSC_VER +# if defined(_M_X64) +# define ARCH_X86_64 +# elif defined(_M_IX86) +# define ARCH_X86_32 +# elif defined(_M_ARM64) +# define ARCH_ARM64 +# elif defined(_M_ARM) +# define ARCH_ARM32 +# endif +#else +# if defined(__x86_64__) +# define ARCH_X86_64 +# elif defined(__i386__) +# define ARCH_X86_32 +# elif defined(__aarch64__) +# define ARCH_ARM64 +# elif defined(__arm__) +# define ARCH_ARM32 +# endif +#endif + +/* ========================================================================== */ +/* Type definitions */ +/* ========================================================================== */ + +/* Fixed-width integer types */ +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +/* ssize_t, if not available in */ +#ifdef _MSC_VER +# ifdef _WIN64 + typedef long long ssize_t; +# else + typedef long ssize_t; +# endif +#endif + +/* + * Word type of the target architecture. Use 'size_t' instead of + * 'unsigned long' to account for platforms such as Windows that use 32-bit + * 'unsigned long' on 64-bit architectures. + */ +typedef size_t machine_word_t; + +/* Number of bytes in a word */ +#define WORDBYTES ((int)sizeof(machine_word_t)) + +/* Number of bits in a word */ +#define WORDBITS (8 * WORDBYTES) + +/* ========================================================================== */ +/* Optional compiler features */ +/* ========================================================================== */ + +/* Compiler version checks. Only use when absolutely necessary. */ +#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) +# define GCC_PREREQ(major, minor) \ + (__GNUC__ > (major) || \ + (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) +#else +# define GCC_PREREQ(major, minor) 0 +#endif +#ifdef __clang__ +# ifdef __apple_build_version__ +# define CLANG_PREREQ(major, minor, apple_version) \ + (__apple_build_version__ >= (apple_version)) +# else +# define CLANG_PREREQ(major, minor, apple_version) \ + (__clang_major__ > (major) || \ + (__clang_major__ == (major) && __clang_minor__ >= (minor))) +# endif +#else +# define CLANG_PREREQ(major, minor, apple_version) 0 +#endif + +/* + * Macros to check for compiler support for attributes and builtins. clang + * implements these macros, but gcc doesn't, so generally any use of one of + * these macros must also be combined with a gcc version check. + */ +#ifndef __has_attribute +# define __has_attribute(attribute) 0 +#endif +#ifndef __has_builtin +# define __has_builtin(builtin) 0 +#endif + +/* + * restrict - hint that writes only occur through the given pointer. + * + * Don't use MSVC's __restrict, since it has nonstandard behavior. + * Standard restrict is okay, if it is supported. 
+ */ +#if !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L) +# if defined(__GNUC__) || defined(__clang__) +# define restrict __restrict__ +# else +# define restrict +# endif +#endif /* else assume 'restrict' is usable as-is */ + +/* likely(expr) - hint that an expression is usually true */ +#if defined(__GNUC__) || __has_builtin(__builtin_expect) +# define likely(expr) __builtin_expect(!!(expr), 1) +#else +# define likely(expr) (expr) +#endif + +/* unlikely(expr) - hint that an expression is usually false */ +#if defined(__GNUC__) || __has_builtin(__builtin_expect) +# define unlikely(expr) __builtin_expect(!!(expr), 0) +#else +# define unlikely(expr) (expr) +#endif + +/* prefetchr(addr) - prefetch into L1 cache for read */ +#undef prefetchr +#if defined(__GNUC__) || __has_builtin(__builtin_prefetch) +# define prefetchr(addr) __builtin_prefetch((addr), 0) +#elif defined(_MSC_VER) +# if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# define prefetchr(addr) _mm_prefetch((addr), _MM_HINT_T0) +# elif defined(ARCH_ARM64) +# define prefetchr(addr) __prefetch2((addr), 0x00 /* prfop=PLDL1KEEP */) +# elif defined(ARCH_ARM32) +# define prefetchr(addr) __prefetch(addr) +# endif +#endif +#ifndef prefetchr +# define prefetchr(addr) +#endif + +/* prefetchw(addr) - prefetch into L1 cache for write */ +#undef prefetchw +#if defined(__GNUC__) || __has_builtin(__builtin_prefetch) +# define prefetchw(addr) __builtin_prefetch((addr), 1) +#elif defined(_MSC_VER) +# if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# define prefetchw(addr) _m_prefetchw(addr) +# elif defined(ARCH_ARM64) +# define prefetchw(addr) __prefetch2((addr), 0x10 /* prfop=PSTL1KEEP */) +# elif defined(ARCH_ARM32) +# define prefetchw(addr) __prefetchw(addr) +# endif +#endif +#ifndef prefetchw +# define prefetchw(addr) +#endif + +/* + * _aligned_attribute(n) - declare that the annotated variable, or variables of + * the annotated type, must be aligned on n-byte boundaries. + */ +#undef _aligned_attribute +#if defined(__GNUC__) || __has_attribute(aligned) +# define _aligned_attribute(n) __attribute__((aligned(n))) +#elif defined(_MSC_VER) +# define _aligned_attribute(n) __declspec(align(n)) +#endif + +/* + * _target_attribute(attrs) - override the compilation target for a function. + * + * This accepts one or more comma-separated suffixes to the -m prefix jointly + * forming the name of a machine-dependent option. On gcc-like compilers, this + * enables codegen for the given targets, including arbitrary compiler-generated + * code as well as the corresponding intrinsics. On other compilers this macro + * expands to nothing, though MSVC allows intrinsics to be used anywhere anyway. + */ +#if GCC_PREREQ(4, 4) || __has_attribute(target) +# define _target_attribute(attrs) __attribute__((target(attrs))) +# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 1 +#else +# define _target_attribute(attrs) +# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0 +#endif + +/* ========================================================================== */ +/* Miscellaneous macros */ +/* ========================================================================== */ + +#define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0])) +#define MIN(a, b) ((a) <= (b) ? (a) : (b)) +#define MAX(a, b) ((a) >= (b) ? 
(a) : (b)) +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)])) +#define ALIGN(n, a) (((n) + (a) - 1) & ~((a) - 1)) +#define ROUND_UP(n, d) ((d) * DIV_ROUND_UP((n), (d))) + +/* ========================================================================== */ +/* Endianness handling */ +/* ========================================================================== */ + +/* + * CPU_IS_LITTLE_ENDIAN() - 1 if the CPU is little endian, or 0 if it is big + * endian. When possible this is a compile-time macro that can be used in + * preprocessor conditionals. As a fallback, a generic method is used that + * can't be used in preprocessor conditionals but should still be optimized out. + */ +#if defined(__BYTE_ORDER__) /* gcc v4.6+ and clang */ +# define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#elif defined(_MSC_VER) +# define CPU_IS_LITTLE_ENDIAN() true +#else +static bool CPU_IS_LITTLE_ENDIAN(void) +{ + union { + u32 w; + u8 b; + } u; + + u.w = 1; + return u.b; +} +#endif + +/* bswap16(v) - swap the bytes of a 16-bit integer */ +static u16 bswap16(u16 v) +{ +#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) + return __builtin_bswap16(v); +#elif defined(_MSC_VER) + return _byteswap_ushort(v); +#else + return (v << 8) | (v >> 8); +#endif +} + +/* bswap32(v) - swap the bytes of a 32-bit integer */ +static u32 bswap32(u32 v) +{ +#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) + return _byteswap_ulong(v); +#else + return ((v & 0x000000FF) << 24) | + ((v & 0x0000FF00) << 8) | + ((v & 0x00FF0000) >> 8) | + ((v & 0xFF000000) >> 24); +#endif +} + +/* bswap64(v) - swap the bytes of a 64-bit integer */ +static u64 bswap64(u64 v) +{ +#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) + return _byteswap_uint64(v); +#else + return ((v & 0x00000000000000FF) << 56) | + ((v & 0x000000000000FF00) << 40) | + ((v & 0x0000000000FF0000) << 24) | + ((v & 0x00000000FF000000) << 8) | + ((v & 0x000000FF00000000) >> 8) | + ((v & 0x0000FF0000000000) >> 24) | + ((v & 0x00FF000000000000) >> 40) | + ((v & 0xFF00000000000000) >> 56); +#endif +} + +#define le16_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap16(v)) +#define le32_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap32(v)) +#define le64_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap64(v)) +#define be16_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap16(v) : (v)) +#define be32_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap32(v) : (v)) +#define be64_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap64(v) : (v)) + +/* ========================================================================== */ +/* Unaligned memory accesses */ +/* ========================================================================== */ + +/* + * UNALIGNED_ACCESS_IS_FAST() - 1 if unaligned memory accesses can be performed + * efficiently on the target platform, otherwise 0. + */ +#if (defined(__GNUC__) || defined(__clang__)) && \ + (defined(ARCH_X86_64) || defined(ARCH_X86_32) || \ + defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \ + /* + * For all compilation purposes, WebAssembly behaves like any other CPU + * instruction set. 
Even though WebAssembly engine might be running on + * top of different actual CPU architectures, the WebAssembly spec + * itself permits unaligned access and it will be fast on most of those + * platforms, and simulated at the engine level on others, so it's + * worth treating it as a CPU architecture with fast unaligned access. + */ defined(__wasm__)) +# define UNALIGNED_ACCESS_IS_FAST 1 +#elif defined(_MSC_VER) +# define UNALIGNED_ACCESS_IS_FAST 1 +#else +# define UNALIGNED_ACCESS_IS_FAST 0 +#endif + +/* + * Implementing unaligned memory accesses using memcpy() is portable, and it + * usually gets optimized appropriately by modern compilers. I.e., each + * memcpy() of 1, 2, 4, or WORDBYTES bytes gets compiled to a load or store + * instruction, not to an actual function call. + * + * We no longer use the "packed struct" approach to unaligned accesses, as that + * is nonstandard, has unclear semantics, and doesn't receive enough testing + * (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994). + * + * arm32 with __ARM_FEATURE_UNALIGNED in gcc 5 and earlier is a known exception + * where memcpy() generates inefficient code + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67366). However, we no longer + * consider that one case important enough to maintain different code for. + * If you run into it, please just use a newer version of gcc (or use clang). + */ + +/* Unaligned loads and stores without endianness conversion */ + +#define DEFINE_UNALIGNED_TYPE(type) \ +static type \ +load_##type##_unaligned(const void *p) \ +{ \ + type v; \ + \ + __builtin_memcpy(&v, p, sizeof(v)); \ + return v; \ +} \ + \ +static void \ +store_##type##_unaligned(type v, void *p) \ +{ \ + __builtin_memcpy(p, &v, sizeof(v)); \ +} + +DEFINE_UNALIGNED_TYPE(u16) +DEFINE_UNALIGNED_TYPE(u32) +DEFINE_UNALIGNED_TYPE(u64) +DEFINE_UNALIGNED_TYPE(machine_word_t) + +#define load_word_unaligned load_machine_word_t_unaligned +#define store_word_unaligned store_machine_word_t_unaligned + +/* Unaligned loads with endianness conversion */ + +static u16 +get_unaligned_le16(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le16_bswap(load_u16_unaligned(p)); + else + return ((u16)p[1] << 8) | p[0]; +} + +static u16 +get_unaligned_be16(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return be16_bswap(load_u16_unaligned(p)); + else + return ((u16)p[0] << 8) | p[1]; +} + +static u32 +get_unaligned_le32(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le32_bswap(load_u32_unaligned(p)); + else + return ((u32)p[3] << 24) | ((u32)p[2] << 16) | + ((u32)p[1] << 8) | p[0]; +} + +static u32 +get_unaligned_be32(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return be32_bswap(load_u32_unaligned(p)); + else + return ((u32)p[0] << 24) | ((u32)p[1] << 16) | + ((u32)p[2] << 8) | p[3]; +} + +static u64 +get_unaligned_le64(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le64_bswap(load_u64_unaligned(p)); + else + return ((u64)p[7] << 56) | ((u64)p[6] << 48) | + ((u64)p[5] << 40) | ((u64)p[4] << 32) | + ((u64)p[3] << 24) | ((u64)p[2] << 16) | + ((u64)p[1] << 8) | p[0]; +} + +static machine_word_t +get_unaligned_leword(const u8 *p) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return get_unaligned_le32(p); + else + return get_unaligned_le64(p); +} + +/* Unaligned stores with endianness conversion */ + +static void +put_unaligned_le16(u16 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(le16_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + } +} 
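+
+/*
+ * Editor's illustrative sketch, not part of the upstream libdeflate header:
+ * the get/put helpers above fix the byte order explicitly, so their results
+ * do not depend on the host CPU's endianness.  The hypothetical round-trip
+ * below holds on both little- and big-endian targets, even at an unaligned
+ * address.
+ */
+static bool
+unaligned_le16_roundtrip_example(void)
+{
+	u8 buf[3];
+
+	/* Store 0xBEEF at an odd (unaligned) offset; bytes become {0xEF, 0xBE}. */
+	put_unaligned_le16(0xBEEF, &buf[1]);
+
+	/* Reading it back recovers the value regardless of host byte order. */
+	return get_unaligned_le16(&buf[1]) == 0xBEEF;
+}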
+ +static void +put_unaligned_be16(u16 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(be16_bswap(v), p); + } else { + p[0] = (u8)(v >> 8); + p[1] = (u8)(v >> 0); + } +} + +static void +put_unaligned_le32(u32 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(le32_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + p[2] = (u8)(v >> 16); + p[3] = (u8)(v >> 24); + } +} + +static void +put_unaligned_be32(u32 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(be32_bswap(v), p); + } else { + p[0] = (u8)(v >> 24); + p[1] = (u8)(v >> 16); + p[2] = (u8)(v >> 8); + p[3] = (u8)(v >> 0); + } +} + +static void +put_unaligned_le64(u64 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u64_unaligned(le64_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + p[2] = (u8)(v >> 16); + p[3] = (u8)(v >> 24); + p[4] = (u8)(v >> 32); + p[5] = (u8)(v >> 40); + p[6] = (u8)(v >> 48); + p[7] = (u8)(v >> 56); + } +} + +static void +put_unaligned_leword(machine_word_t v, u8 *p) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + put_unaligned_le32(v, p); + else + put_unaligned_le64(v, p); +} + +/* ========================================================================== */ +/* Bit manipulation functions */ +/* ========================================================================== */ + +/* + * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least + * significant end) of the *most* significant 1 bit in the input value. The + * input value must be nonzero! + */ + +static unsigned +bsr32(u32 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_clz) + return 31 - __builtin_clz(v); +#elif defined(_MSC_VER) + unsigned long i; + + _BitScanReverse(&i, v); + return i; +#else + unsigned i = 0; + + while ((v >>= 1) != 0) + i++; + return i; +#endif +} + +static unsigned +bsr64(u64 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_clzll) + return 63 - __builtin_clzll(v); +#elif defined(_MSC_VER) && defined(_WIN64) + unsigned long i; + + _BitScanReverse64(&i, v); + return i; +#else + unsigned i = 0; + + while ((v >>= 1) != 0) + i++; + return i; +#endif +} + +static unsigned +bsrw(machine_word_t v) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return bsr32(v); + else + return bsr64(v); +} + +/* + * Bit Scan Forward (BSF) - find the 0-based index (relative to the least + * significant end) of the *least* significant 1 bit in the input value. The + * input value must be nonzero! + */ + +static unsigned +bsf32(u32 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_ctz) + return __builtin_ctz(v); +#elif defined(_MSC_VER) + unsigned long i; + + _BitScanForward(&i, v); + return i; +#else + unsigned i = 0; + + for (; (v & 1) == 0; v >>= 1) + i++; + return i; +#endif +} + +static unsigned +bsf64(u64 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_ctzll) + return __builtin_ctzll(v); +#elif defined(_MSC_VER) && defined(_WIN64) + unsigned long i; + + _BitScanForward64(&i, v); + return i; +#else + unsigned i = 0; + + for (; (v & 1) == 0; v >>= 1) + i++; + return i; +#endif +} + +static unsigned +bsfw(machine_word_t v) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return bsf32(v); + else + return bsf64(v); +} + +/* + * rbit32(v): reverse the bits in a 32-bit integer. This doesn't have a + * fallback implementation; use '#ifdef rbit32' to check if this is available. 
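+ *
+ * (Editor's illustration, not upstream text: callers guard every use of it,
+ * e.g.
+ *
+ *	#ifdef rbit32
+ *		bits = rbit32(bits);
+ *	#else
+ *		... portable bit-reversal fallback ...
+ *	#endif
+ *
+ * so that builds for targets without the ARM 'rbit' instruction still
+ * compile.)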
+ */ +#undef rbit32 +#if (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM32) && \ + (__ARM_ARCH >= 7 || (__ARM_ARCH == 6 && defined(__ARM_ARCH_6T2__))) +static u32 +rbit32(u32 v) +{ + __asm__("rbit %0, %1" : "=r" (v) : "r" (v)); + return v; +} +#define rbit32 rbit32 +#elif (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM64) +static u32 +rbit32(u32 v) +{ + __asm__("rbit %w0, %w1" : "=r" (v) : "r" (v)); + return v; +} +#define rbit32 rbit32 +#endif + +#endif /* COMMON_DEFS_H */ diff --git a/packages/wasm/lib/crypto/COPYING.lesser b/packages/wasm/lib/crypto/COPYING.lesser new file mode 100644 index 00000000..153d416d --- /dev/null +++ b/packages/wasm/lib/crypto/COPYING.lesser @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. 
+ + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. 
+ + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. \ No newline at end of file diff --git a/packages/crypto/aes256.c b/packages/wasm/lib/crypto/aes256.c similarity index 100% rename from packages/crypto/aes256.c rename to packages/wasm/lib/crypto/aes256.c diff --git a/packages/crypto/aes256.h b/packages/wasm/lib/crypto/aes256.h similarity index 84% rename from packages/crypto/aes256.h rename to packages/wasm/lib/crypto/aes256.h index c7af249b..c84ea460 100644 --- a/packages/crypto/aes256.h +++ b/packages/wasm/lib/crypto/aes256.h @@ -1,12 +1,11 @@ -#include -#include -#include +#include "lib_common.h" #ifndef AES256_H #define AES256_H #define AES_BLOCK_SIZE 16 #define EXPANDED_KEY_SIZE 60 +#define AES_EXPORT __attribute__((visibility("default"))) #ifdef __cplusplus extern "C" { diff --git a/packages/wasm/lib/crypto/ctr256.c b/packages/wasm/lib/crypto/ctr256.c new file mode 100644 index 00000000..71239e59 --- /dev/null +++ b/packages/wasm/lib/crypto/ctr256.c @@ -0,0 +1,55 @@ +#include "aes256.h" + +struct ctr256_ctx { + uint32_t expandedKey[EXPANDED_KEY_SIZE]; + uint8_t* iv; + uint8_t state; +}; + +AES_EXPORT struct ctr256_ctx* ctr256_alloc(uint8_t* key, uint8_t* iv) { + struct ctr256_ctx *state = (struct ctr256_ctx *) __malloc(sizeof(struct ctr256_ctx)); + aes256_set_encryption_key(key, state->expandedKey); + __free(key); + + state->iv = iv; + state->state = 0; + + return state; +} + +AES_EXPORT void ctr256_free(struct ctr256_ctx* ctx) { + __free(ctx->iv); + __free(ctx); +} + +AES_EXPORT void ctr256(struct ctr256_ctx* ctx, uint8_t* in, uint32_t length, uint8_t *out) { + uint8_t chunk[AES_BLOCK_SIZE]; + uint32_t* expandedKey = ctx->expandedKey; + uint8_t* iv = ctx->iv; + uint8_t state = ctx->state; + uint32_t i, j, k; + + aes256_encrypt(iv, chunk, expandedKey); + + for (i = 0; i < length; i += AES_BLOCK_SIZE) { + for (j = 0; j < MIN(length - i, AES_BLOCK_SIZE); ++j) { + out[i + j] = in[i + j] ^ chunk[state++]; + + if (state >= AES_BLOCK_SIZE) + state = 0; + + if (state == 0) { + k = AES_BLOCK_SIZE; + while(k--) + if (++iv[k]) + break; + + aes256_encrypt(iv, chunk, expandedKey); + } + } + } + + __free(in); + + ctx->state = state; +} \ No newline at end of 
file diff --git a/packages/wasm/lib/crypto/ctr256.h b/packages/wasm/lib/crypto/ctr256.h new file mode 100644 index 00000000..9c984a09 --- /dev/null +++ b/packages/wasm/lib/crypto/ctr256.h @@ -0,0 +1,6 @@ +#ifndef CTR256_H +#define CTR256_H + +uint8_t *ctr256(const uint8_t in[], uint32_t length, const uint8_t key[32], uint8_t iv[16], uint8_t *state); + +#endif \ No newline at end of file diff --git a/packages/crypto/ige256.c b/packages/wasm/lib/crypto/ige256.c similarity index 84% rename from packages/crypto/ige256.c rename to packages/wasm/lib/crypto/ige256.c index 1e1cc731..6bc054ed 100644 --- a/packages/crypto/ige256.c +++ b/packages/wasm/lib/crypto/ige256.c @@ -1,6 +1,6 @@ #include "aes256.h" -void ige256_encrypt(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uint8_t* out) { +AES_EXPORT void ige256_encrypt(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uint8_t* out) { uint32_t expandedKey[EXPANDED_KEY_SIZE]; uint32_t i, j; @@ -29,7 +29,7 @@ void ige256_encrypt(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uin } } -void ige256_decrypt(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uint8_t* out) { +AES_EXPORT void ige256_decrypt(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uint8_t* out) { uint32_t expandedKey[EXPANDED_KEY_SIZE]; uint32_t i, j; diff --git a/packages/crypto/ige256.h b/packages/wasm/lib/crypto/ige256.h similarity index 100% rename from packages/crypto/ige256.h rename to packages/wasm/lib/crypto/ige256.h diff --git a/packages/wasm/lib/lib_common.h b/packages/wasm/lib/lib_common.h new file mode 100644 index 00000000..8bf32b1c --- /dev/null +++ b/packages/wasm/lib/lib_common.h @@ -0,0 +1,62 @@ +/* + * lib_common.h - internal header included by all library code + */ + +#ifndef LIB_LIB_COMMON_H +#define LIB_LIB_COMMON_H + +#ifdef LIBDEFLATE_H + /* + * When building the library, LIBDEFLATEAPI needs to be defined properly before + * including libdeflate.h. + */ +# error "lib_common.h must always be included before libdeflate.h" +#endif + +#if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBDEFLATE_EXPORT_SYM __declspec(dllexport) +#elif defined(__GNUC__) +# define LIBDEFLATE_EXPORT_SYM __attribute__((visibility("default"))) +#else +# define LIBDEFLATE_EXPORT_SYM +#endif + +/* + * On i386, gcc assumes that the stack is 16-byte aligned at function entry. + * However, some compilers (e.g. MSVC) and programming languages (e.g. Delphi) + * only guarantee 4-byte alignment when calling functions. This is mainly an + * issue on Windows, but it has been seen on Linux too. Work around this ABI + * incompatibility by realigning the stack pointer when entering libdeflate. + * This prevents crashes in SSE/AVX code. 
+ */ +#if defined(__GNUC__) && defined(__i386__) +# define LIBDEFLATE_ALIGN_STACK __attribute__((force_align_arg_pointer)) +#else +# define LIBDEFLATE_ALIGN_STACK +#endif + +#define LIBDEFLATEAPI LIBDEFLATE_EXPORT_SYM LIBDEFLATE_ALIGN_STACK + +#include "common_defs.h" + +extern void* __malloc(size_t size); +extern void __free(void* ptr); + +void *libdeflate_aligned_malloc(size_t alignment, size_t size); +void libdeflate_aligned_free(void *ptr); + +#define ASSERT(expr) (void)(expr) +#define CONCAT_IMPL(a, b) a##b +#define CONCAT(a, b) CONCAT_IMPL(a, b) +#define ADD_SUFFIX(name) CONCAT(name, SUFFIX) + +#ifdef LOGGING +void __debug(char* str); + +#define DEBUG(str) __debug(str); + +#else +#define DEBUG(str) +#endif + +#endif /* LIB_LIB_COMMON_H */ diff --git a/packages/wasm/lib/libdeflate.h b/packages/wasm/lib/libdeflate.h new file mode 100644 index 00000000..1ac01833 --- /dev/null +++ b/packages/wasm/lib/libdeflate.h @@ -0,0 +1,245 @@ +/* + * libdeflate.h - public header for libdeflate + */ + +#ifndef LIBDEFLATE_H +#define LIBDEFLATE_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIBDEFLATE_VERSION_MAJOR 1 +#define LIBDEFLATE_VERSION_MINOR 19 +#define LIBDEFLATE_VERSION_STRING "1.19" + +/* + * Users of libdeflate.dll on Windows can define LIBDEFLATE_DLL to cause + * __declspec(dllimport) to be used. This should be done when it's easy to do. + * Otherwise it's fine to skip it, since it is a very minor performance + * optimization that is irrelevant for most use cases of libdeflate. + */ +#ifndef LIBDEFLATEAPI +# if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBDEFLATEAPI __declspec(dllimport) +# else +# define LIBDEFLATEAPI +# endif +#endif + +/* ========================================================================== */ +/* Compression */ +/* ========================================================================== */ + +struct libdeflate_compressor; +struct libdeflate_options; + +/* + * libdeflate_alloc_compressor() allocates a new compressor that supports + * DEFLATE, zlib, and gzip compression. 'compression_level' is the compression + * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 = + * medium/default, 9 = slow, 12 = slowest). Level 0 is also supported and means + * "no compression", specifically "create a valid stream, but only emit + * uncompressed blocks" (this will expand the data slightly). + * + * The return value is a pointer to the new compressor, or NULL if out of memory + * or if the compression level is invalid (i.e. outside the range [0, 12]). + * + * Note: for compression, the sliding window size is defined at compilation time + * to 32768, the largest size permissible in the DEFLATE format. It cannot be + * changed at runtime. + * + * A single compressor is not safe to use by multiple threads concurrently. + * However, different threads may use different compressors concurrently. + */ +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor(int compression_level); + +/* + * Like libdeflate_alloc_compressor(), but adds the 'options' argument. 
+ */ +//LIBDEFLATEAPI struct libdeflate_compressor * +//libdeflate_alloc_compressor_ex(int compression_level, +// const struct libdeflate_options *options); + +LIBDEFLATEAPI size_t +libdeflate_gzip_compress(struct libdeflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +//LIBDEFLATEAPI size_t +//libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor, +// size_t in_nbytes); + +/* + * libdeflate_free_compressor() frees a compressor that was allocated with + * libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is + * taken. + */ +LIBDEFLATEAPI void +libdeflate_free_compressor(struct libdeflate_compressor *compressor); + +/* ========================================================================== */ +/* Decompression */ +/* ========================================================================== */ + +struct libdeflate_decompressor; +struct libdeflate_options; + +/* + * libdeflate_alloc_decompressor() allocates a new decompressor that can be used + * for DEFLATE, zlib, and gzip decompression. The return value is a pointer to + * the new decompressor, or NULL if out of memory. + * + * This function takes no parameters, and the returned decompressor is valid for + * decompressing data that was compressed at any compression level and with any + * sliding window size. + * + * A single decompressor is not safe to use by multiple threads concurrently. + * However, different threads may use different decompressors concurrently. + */ +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor(void); + +/* + * Like libdeflate_alloc_decompressor(), but adds the 'options' argument. + */ +//LIBDEFLATEAPI struct libdeflate_decompressor * +//libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options); + +/* + * Result of a call to libdeflate_deflate_decompress(), + * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress(). + */ +enum libdeflate_result { + /* Decompression was successful. */ + LIBDEFLATE_SUCCESS = 0, + + /* Decompression failed because the compressed data was invalid, + * corrupt, or otherwise unsupported. */ + LIBDEFLATE_BAD_DATA = 1, + + /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have + * decompressed to fewer than 'out_nbytes_avail' bytes. */ + LIBDEFLATE_SHORT_OUTPUT = 2, + + /* The data would have decompressed to more than 'out_nbytes_avail' + * bytes. */ + LIBDEFLATE_INSUFFICIENT_SPACE = 3, +}; + +/* + * libdeflate_deflate_decompress() decompresses a DEFLATE stream from the buffer + * 'in' with compressed size up to 'in_nbytes' bytes. The uncompressed data is + * written to 'out', a buffer with size 'out_nbytes_avail' bytes. If + * decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. Otherwise, + * a nonzero result code such as LIBDEFLATE_BAD_DATA is returned, and the + * contents of the output buffer are undefined. + * + * Decompression stops at the end of the DEFLATE stream (as indicated by the + * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes. + * + * libdeflate_deflate_decompress() can be used in cases where the actual + * uncompressed size is known (recommended) or unknown (not recommended): + * + * - If the actual uncompressed size is known, then pass the actual + * uncompressed size as 'out_nbytes_avail' and pass NULL for + * 'actual_out_nbytes_ret'. 
This makes libdeflate_deflate_decompress() fail + * with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the + * specified number of bytes. + * + * - If the actual uncompressed size is unknown, then provide a non-NULL + * 'actual_out_nbytes_ret' and provide a buffer with some size + * 'out_nbytes_avail' that you think is large enough to hold all the + * uncompressed data. In this case, if the data decompresses to less than + * or equal to 'out_nbytes_avail' bytes, then + * libdeflate_deflate_decompress() will write the actual uncompressed size + * to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS). Otherwise, + * it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was + * not large enough but no other problems were encountered, or another + * nonzero result code if decompression failed for another reason. + */ +//LIBDEFLATEAPI enum libdeflate_result +//libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor, +// const void *in, size_t in_nbytes, +// void *out, size_t out_nbytes_avail, +// size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL, + * then the actual compressed size of the DEFLATE stream (aligned to the next + * byte boundary) is written to *actual_in_nbytes_ret. + */ +enum libdeflate_result +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format + * instead of raw DEFLATE. + * + * If multiple gzip-compressed members are concatenated, then only the first + * will be decompressed. Use libdeflate_gzip_decompress_ex() if you need + * multi-member support. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + + +/* + * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression + * succeeds (indicating that the first gzip-compressed member in the input + * buffer was decompressed), then the actual number of input bytes consumed is + * written to *actual_in_nbytes_ret. + */ +//LIBDEFLATEAPI enum libdeflate_result +//libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor, +// const void *in, size_t in_nbytes, +// void *out, size_t out_nbytes_avail, +// size_t *actual_in_nbytes_ret, +// size_t *actual_out_nbytes_ret); + +/* + * libdeflate_free_decompressor() frees a decompressor that was allocated with + * libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action + * is taken. + */ +LIBDEFLATEAPI void +libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); + +/* + * Advanced options. This is the options structure that + * libdeflate_alloc_compressor_ex() and libdeflate_alloc_decompressor_ex() + * require. Most users won't need this and should just use the non-"_ex" + * functions instead. If you do need this, it should be initialized like this: + * + * struct libdeflate_options options; + * + * __builtin_memset(&options, 0, sizeof(options)); + * options.sizeof_options = sizeof(options); + * // Then set the fields that you need to override the defaults for. 
+ */ +struct libdeflate_options { + /* + * This field must be set to the struct size. This field exists for + * extensibility, so that fields can be appended to this struct in + * future versions of libdeflate while still supporting old binaries. + */ + size_t sizeof_options; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* LIBDEFLATE_H */ diff --git a/packages/wasm/lib/libdeflate/COPYING b/packages/wasm/lib/libdeflate/COPYING new file mode 100644 index 00000000..1f1b81cd --- /dev/null +++ b/packages/wasm/lib/libdeflate/COPYING @@ -0,0 +1,21 @@ +Copyright 2016 Eric Biggers + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation files +(the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of the Software, +and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/packages/wasm/lib/libdeflate/adler32.c b/packages/wasm/lib/libdeflate/adler32.c new file mode 100644 index 00000000..9043000a --- /dev/null +++ b/packages/wasm/lib/libdeflate/adler32.c @@ -0,0 +1,123 @@ +/* + * adler32.c - Adler-32 checksum algorithm + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "lib_common.h" + +/* The Adler-32 divisor, or "base", value */ +#define DIVISOR 65521 + +/* + * MAX_CHUNK_LEN is the most bytes that can be processed without the possibility + * of s2 overflowing when it is represented as an unsigned 32-bit integer. 
This + * value was computed using the following Python script: + * + * divisor = 65521 + * count = 0 + * s1 = divisor - 1 + * s2 = divisor - 1 + * while True: + * s1 += 0xFF + * s2 += s1 + * if s2 > 0xFFFFFFFF: + * break + * count += 1 + * print(count) + * + * Note that to get the correct worst-case value, we must assume that every byte + * has value 0xFF and that s1 and s2 started with the highest possible values + * modulo the divisor. + */ +#define MAX_CHUNK_LEN 5552 + +static u32 +adler32_generic(u32 adler, const u8 *p, size_t len) +{ + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + const u8 * const end = p + len; + + while (p != end) { + size_t chunk_len = MIN(end - p, MAX_CHUNK_LEN); + const u8 *chunk_end = p + chunk_len; + size_t num_unrolled_iterations = chunk_len / 4; + + while (num_unrolled_iterations--) { + s1 += *p++; + s2 += s1; + s1 += *p++; + s2 += s1; + s1 += *p++; + s2 += s1; + s1 += *p++; + s2 += s1; + } + while (p != chunk_end) { + s1 += *p++; + s2 += s1; + } + s1 %= DIVISOR; + s2 %= DIVISOR; + } + + return (s2 << 16) | s1; +} + +/* Include architecture-specific implementation(s) if available. */ +#undef DEFAULT_IMPL +#undef arch_select_adler32_func +typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len); + +#define DEFAULT_IMPL adler32_generic + +#ifdef arch_select_adler32_func +static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len); + +static volatile adler32_func_t adler32_impl = dispatch_adler32; + +/* Choose the best implementation at runtime. */ +static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len) +{ + adler32_func_t f = arch_select_adler32_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + adler32_impl = f; + return f(adler, p, len); +} +#else +/* The best implementation is statically known, so call it directly. */ +#define adler32_impl DEFAULT_IMPL +#endif + +u32 +libdeflate_adler32(u32 adler, const void *buffer, size_t len) +{ + if (buffer == NULL) /* Return initial value. */ + return 1; + return adler32_impl(adler, buffer, len); +} diff --git a/packages/wasm/lib/libdeflate/adler32.h b/packages/wasm/lib/libdeflate/adler32.h new file mode 100644 index 00000000..5468e14c --- /dev/null +++ b/packages/wasm/lib/libdeflate/adler32.h @@ -0,0 +1,8 @@ +#ifndef LIB_DEFLATE_ADLER32_H +#define LIB_DEFLATE_ADLER32_H + +#include "lib_common.h" + +u32 libdeflate_adler32(u32 adler, const void *buffer, size_t len); + +#endif /* LIB_DEFLATE_ADLER32_H */ diff --git a/packages/wasm/lib/libdeflate/bt_matchfinder.h b/packages/wasm/lib/libdeflate/bt_matchfinder.h new file mode 100644 index 00000000..7bc4f04d --- /dev/null +++ b/packages/wasm/lib/libdeflate/bt_matchfinder.h @@ -0,0 +1,342 @@ +/* + * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * ---------------------------------------------------------------------------- + * + * This is a Binary Trees (bt) based matchfinder. + * + * The main data structure is a hash table where each hash bucket contains a + * binary tree of sequences whose first 4 bytes share the same hash code. Each + * sequence is identified by its starting position in the input buffer. Each + * binary tree is always sorted such that each left child represents a sequence + * lexicographically lesser than its parent and each right child represents a + * sequence lexicographically greater than its parent. + * + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 4 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, a new binary tree + * node is created to represent the current sequence. Then, in a single tree + * traversal, the hash bucket's binary tree is searched for matches and is + * re-rooted at the new node. + * + * Compared to the simpler algorithm that uses linked lists instead of binary + * trees (see hc_matchfinder.h), the binary tree version gains more information + * at each node visitation. Ideally, the binary tree version will examine only + * 'log(n)' nodes to find the same matches that the linked list version will + * find by examining 'n' nodes. In addition, the binary tree version can + * examine fewer bytes at each node by taking advantage of the common prefixes + * that result from the sort order, whereas the linked list version may have to + * examine up to the full length of the match at each node. + * + * However, it is not always best to use the binary tree version. It requires + * nearly twice as much memory as the linked list version, and it takes time to + * keep the binary trees sorted, even at positions where the compressor does not + * need matches. Generally, when doing fast compression on small buffers, + * binary trees are the wrong approach. They are best suited for thorough + * compression and/or large buffers. + * + * ---------------------------------------------------------------------------- + */ + +#ifndef LIB_BT_MATCHFINDER_H +#define LIB_BT_MATCHFINDER_H + +#include "matchfinder_common.h" + +#define BT_MATCHFINDER_HASH3_ORDER 16 +#define BT_MATCHFINDER_HASH3_WAYS 2 +#define BT_MATCHFINDER_HASH4_ORDER 16 + +#define BT_MATCHFINDER_TOTAL_HASH_SIZE \ + (((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \ + (1UL << BT_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t)) + +/* Representation of a match found by the bt_matchfinder */ +struct lz_match { + + /* The number of bytes matched. */ + u16 length; + + /* The offset back from the current position that was matched. 
*/ + u16 offset; +}; + +struct MATCHFINDER_ALIGNED bt_matchfinder { + + /* The hash table for finding length 3 matches */ + mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS]; + + /* The hash table which contains the roots of the binary trees for + * finding length 4+ matches */ + mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER]; + + /* The child node references for the binary trees. The left and right + * children of the node for the sequence with position 'pos' are + * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */ + mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE]; +}; + +/* Prepare the matchfinder for a new input buffer. */ +static void +bt_matchfinder_init(struct bt_matchfinder *mf) +{ + STATIC_ASSERT(BT_MATCHFINDER_TOTAL_HASH_SIZE % + MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_SIZE); +} + +static void +bt_matchfinder_slide_window(struct bt_matchfinder *mf) +{ + STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf)); +} + +static mf_pos_t * +bt_left_child(struct bt_matchfinder *mf, s32 node) +{ + return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0]; +} + +static mf_pos_t * +bt_right_child(struct bt_matchfinder *mf, s32 node) +{ + return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1]; +} + +/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches() + * and bt_matchfinder_skip_byte(). There must be sufficiently many bytes + * remaining to load a 32-bit integer from the *next* position. */ +#define BT_MATCHFINDER_REQUIRED_NBYTES 5 + +/* Advance the binary tree matchfinder by one byte, optionally recording + * matches. @record_matches should be a compile-time constant. 
*/ +static struct lz_match * +bt_matchfinder_advance_one_byte(struct bt_matchfinder * const mf, + const u8 * const in_base, + const ptrdiff_t cur_pos, + const u32 max_len, + const u32 nice_len, + const u32 max_search_depth, + u32 * const next_hashes, + struct lz_match *lz_matchptr, + const bool record_matches) +{ + const u8 *in_next = in_base + cur_pos; + u32 depth_remaining = max_search_depth; + const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE; + u32 next_hashseq; + u32 hash3; + u32 hash4; + s32 cur_node; +#if BT_MATCHFINDER_HASH3_WAYS >= 2 + s32 cur_node_2; +#endif + const u8 *matchptr; + mf_pos_t *pending_lt_ptr, *pending_gt_ptr; + u32 best_lt_len, best_gt_len; + u32 len; + u32 best_len = 3; + + STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 && + BT_MATCHFINDER_HASH3_WAYS <= 2); + + next_hashseq = get_unaligned_le32(in_next + 1); + + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + + next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, BT_MATCHFINDER_HASH3_ORDER); + next_hashes[1] = lz_hash(next_hashseq, BT_MATCHFINDER_HASH4_ORDER); + prefetchw(&mf->hash3_tab[next_hashes[0]]); + prefetchw(&mf->hash4_tab[next_hashes[1]]); + + cur_node = mf->hash3_tab[hash3][0]; + mf->hash3_tab[hash3][0] = cur_pos; +#if BT_MATCHFINDER_HASH3_WAYS >= 2 + cur_node_2 = mf->hash3_tab[hash3][1]; + mf->hash3_tab[hash3][1] = cur_node; +#endif + if (record_matches && cur_node > cutoff) { + u32 seq3 = load_u24_unaligned(in_next); + if (seq3 == load_u24_unaligned(&in_base[cur_node])) { + lz_matchptr->length = 3; + lz_matchptr->offset = in_next - &in_base[cur_node]; + lz_matchptr++; + } + #if BT_MATCHFINDER_HASH3_WAYS >= 2 + else if (cur_node_2 > cutoff && + seq3 == load_u24_unaligned(&in_base[cur_node_2])) + { + lz_matchptr->length = 3; + lz_matchptr->offset = in_next - &in_base[cur_node_2]; + lz_matchptr++; + } + #endif + } + + cur_node = mf->hash4_tab[hash4]; + mf->hash4_tab[hash4] = cur_pos; + + pending_lt_ptr = bt_left_child(mf, cur_pos); + pending_gt_ptr = bt_right_child(mf, cur_pos); + + if (cur_node <= cutoff) { + *pending_lt_ptr = MATCHFINDER_INITVAL; + *pending_gt_ptr = MATCHFINDER_INITVAL; + return lz_matchptr; + } + + best_lt_len = 0; + best_gt_len = 0; + len = 0; + + for (;;) { + matchptr = &in_base[cur_node]; + + if (matchptr[len] == in_next[len]) { + len = lz_extend(in_next, matchptr, len + 1, max_len); + if (!record_matches || len > best_len) { + if (record_matches) { + best_len = len; + lz_matchptr->length = len; + lz_matchptr->offset = in_next - matchptr; + lz_matchptr++; + } + if (len >= nice_len) { + *pending_lt_ptr = *bt_left_child(mf, cur_node); + *pending_gt_ptr = *bt_right_child(mf, cur_node); + return lz_matchptr; + } + } + } + + if (matchptr[len] < in_next[len]) { + *pending_lt_ptr = cur_node; + pending_lt_ptr = bt_right_child(mf, cur_node); + cur_node = *pending_lt_ptr; + best_lt_len = len; + if (best_gt_len < len) + len = best_gt_len; + } else { + *pending_gt_ptr = cur_node; + pending_gt_ptr = bt_left_child(mf, cur_node); + cur_node = *pending_gt_ptr; + best_gt_len = len; + if (best_lt_len < len) + len = best_lt_len; + } + + if (cur_node <= cutoff || !--depth_remaining) { + *pending_lt_ptr = MATCHFINDER_INITVAL; + *pending_gt_ptr = MATCHFINDER_INITVAL; + return lz_matchptr; + } + } +} + +/* + * Retrieve a list of matches with the current position. + * + * @mf + * The matchfinder structure. + * @in_base + * Pointer to the next byte in the input buffer to process _at the last + * time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_. 
+ * @cur_pos + * The current position in the input buffer relative to @in_base (the + * position of the sequence being matched against). + * @max_len + * The maximum permissible match length at this position. Must be >= + * BT_MATCHFINDER_REQUIRED_NBYTES. + * @nice_len + * Stop searching if a match of at least this length is found. + * Must be <= @max_len. + * @max_search_depth + * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + 1. + * @lz_matchptr + * An array in which this function will record the matches. The recorded + * matches will be sorted by strictly increasing length and (non-strictly) + * increasing offset. The maximum number of matches that may be found is + * 'nice_len - 2'. + * + * The return value is a pointer to the next available slot in the @lz_matchptr + * array. (If no matches were found, this will be the same as @lz_matchptr.) + */ +static struct lz_match * +bt_matchfinder_get_matches(struct bt_matchfinder *mf, + const u8 *in_base, + ptrdiff_t cur_pos, + u32 max_len, + u32 nice_len, + u32 max_search_depth, + u32 next_hashes[2], + struct lz_match *lz_matchptr) +{ + return bt_matchfinder_advance_one_byte(mf, + in_base, + cur_pos, + max_len, + nice_len, + max_search_depth, + next_hashes, + lz_matchptr, + true); +} + +/* + * Advance the matchfinder, but don't record any matches. + * + * This is very similar to bt_matchfinder_get_matches() because both functions + * must do hashing and tree re-rooting. + */ +static void +bt_matchfinder_skip_byte(struct bt_matchfinder *mf, + const u8 *in_base, + ptrdiff_t cur_pos, + u32 nice_len, + u32 max_search_depth, + u32 next_hashes[2]) +{ + bt_matchfinder_advance_one_byte(mf, + in_base, + cur_pos, + nice_len, + nice_len, + max_search_depth, + next_hashes, + NULL, + false); +} + +#endif /* LIB_BT_MATCHFINDER_H */ diff --git a/packages/wasm/lib/libdeflate/decompress_template.h b/packages/wasm/lib/libdeflate/decompress_template.h new file mode 100644 index 00000000..ac1987f3 --- /dev/null +++ b/packages/wasm/lib/libdeflate/decompress_template.h @@ -0,0 +1,777 @@ +/* + * decompress_template.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +/* + * This is the actual DEFLATE decompression routine, lifted out of + * deflate_decompress.c so that it can be compiled multiple times with different + * target instruction sets. + */ + +#ifndef ATTRIBUTES +# define ATTRIBUTES +#endif +#ifndef EXTRACT_VARBITS +# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count)) +#endif +#ifndef EXTRACT_VARBITS8 +# define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count))) +#endif + +static enum libdeflate_result ATTRIBUTES +FUNCNAME(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) +{ + u8 *out_next = out; + u8 * const out_end = out_next + out_nbytes_avail; + u8 * const out_fastloop_end = + out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN); + + /* Input bitstream state; see deflate_decompress.c for documentation */ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + const u8 * const in_fastloop_end = + in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ); + bitbuf_t bitbuf = 0; + bitbuf_t saved_bitbuf; + u32 bitsleft = 0; + size_t overread_count = 0; + + bool is_final_block; + unsigned block_type; + unsigned num_litlen_syms; + unsigned num_offset_syms; + bitbuf_t litlen_tablemask; + u32 entry; + +next_block: + /* Starting to read the next block */ + ; + + STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3)); + REFILL_BITS(); + + /* BFINAL: 1 bit */ + is_final_block = bitbuf & BITMASK(1); + + /* BTYPE: 2 bits */ + block_type = (bitbuf >> 1) & BITMASK(2); + + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { + + /* Dynamic Huffman block */ + + /* The order in which precode lengths are stored */ + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + unsigned num_explicit_precode_lens; + unsigned i; + + /* Read the codeword length counts. */ + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5)); + num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5)); + num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4)); + num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4)); + + d->static_codes_loaded = false; + + /* + * Read the precode codeword lengths. + * + * A 64-bit bitbuffer is just one bit too small to hold the + * maximum number of precode lens, so to minimize branches we + * merge one len with the previous fields. + */ + STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); + if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { + d->u.precode_lens[deflate_precode_lens_permutation[0]] = + (bitbuf >> 17) & BITMASK(3); + bitbuf >>= 20; + bitsleft -= 20; + REFILL_BITS(); + i = 1; + do { + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } else { + bitbuf >>= 17; + bitsleft -= 17; + i = 0; + do { + if ((u8)bitsleft < 3) + REFILL_BITS(); + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } + for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) + d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; + + /* Build the decode table for the precode. 
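
The block header fields above (BFINAL, then BTYPE, then the dynamic-header counts) are read from an LSB-first bitstream. The standalone sketch below is not part of libdeflate; it uses a hypothetical byte-at-a-time reader (struct bitreader, read_bits) to show the same bit ordering, without the word-sized REFILL_BITS machinery used in the real decompressor.

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Minimal LSB-first bit reader (illustration only; the real decompressor
 * refills a machine-word-sized bitbuffer for speed). The caller must not
 * request more bits than remain in the input. */
struct bitreader {
    const uint8_t *in;
    size_t in_len;
    size_t in_pos;
    uint32_t bitbuf;   /* bits not yet consumed, LSB = next bit */
    unsigned bitsleft;
};

static uint32_t read_bits(struct bitreader *br, unsigned n)
{
    while (br->bitsleft < n && br->in_pos < br->in_len) {
        br->bitbuf |= (uint32_t)br->in[br->in_pos++] << br->bitsleft;
        br->bitsleft += 8;
    }
    uint32_t v = br->bitbuf & ((1u << n) - 1);
    br->bitbuf >>= n;
    br->bitsleft -= n;
    return v;
}

int main(void)
{
    /* 0x03 = binary 00000011: BFINAL=1, then BTYPE=01 (static Huffman). */
    const uint8_t data[] = { 0x03 };
    struct bitreader br = { data, sizeof(data), 0, 0, 0 };

    unsigned bfinal = read_bits(&br, 1);  /* -> 1 */
    unsigned btype  = read_bits(&br, 2);  /* -> 1 (static Huffman) */
    printf("BFINAL=%u BTYPE=%u\n", bfinal, btype);
    return 0;
}
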
*/ + SAFETY_CHECK(build_precode_decode_table(d)); + + /* Decode the litlen and offset codeword lengths. */ + i = 0; + do { + unsigned presym; + u8 rep_val; + unsigned rep_count; + + if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7) + REFILL_BITS(); + + /* + * The code below assumes that the precode decode table + * doesn't have any subtables. + */ + STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); + + /* Decode the next precode symbol. */ + entry = d->u.l.precode_decode_table[ + bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)]; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + presym = entry >> 16; + + if (presym < 16) { + /* Explicit codeword length */ + d->u.l.lens[i++] = presym; + continue; + } + + /* Run-length encoded codeword lengths */ + + /* + * Note: we don't need to immediately verify that the + * repeat count doesn't overflow the number of elements, + * since we've sized the lens array to have enough extra + * space to allow for the worst-case overrun (138 zeroes + * when only 1 length was remaining). + * + * In the case of the small repeat counts (presyms 16 + * and 17), it is fastest to always write the maximum + * number of entries. That gets rid of branches that + * would otherwise be required. + * + * It is not just because of the numerical order that + * our checks go in the order 'presym < 16', 'presym == + * 16', and 'presym == 17'. For typical data this is + * ordered from most frequent to least frequent case. + */ + STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); + + if (presym == 16) { + /* Repeat the previous length 3 - 6 times. */ + SAFETY_CHECK(i != 0); + rep_val = d->u.l.lens[i - 1]; + STATIC_ASSERT(3 + BITMASK(2) == 6); + rep_count = 3 + (bitbuf & BITMASK(2)); + bitbuf >>= 2; + bitsleft -= 2; + d->u.l.lens[i + 0] = rep_val; + d->u.l.lens[i + 1] = rep_val; + d->u.l.lens[i + 2] = rep_val; + d->u.l.lens[i + 3] = rep_val; + d->u.l.lens[i + 4] = rep_val; + d->u.l.lens[i + 5] = rep_val; + i += rep_count; + } else if (presym == 17) { + /* Repeat zero 3 - 10 times. */ + STATIC_ASSERT(3 + BITMASK(3) == 10); + rep_count = 3 + (bitbuf & BITMASK(3)); + bitbuf >>= 3; + bitsleft -= 3; + d->u.l.lens[i + 0] = 0; + d->u.l.lens[i + 1] = 0; + d->u.l.lens[i + 2] = 0; + d->u.l.lens[i + 3] = 0; + d->u.l.lens[i + 4] = 0; + d->u.l.lens[i + 5] = 0; + d->u.l.lens[i + 6] = 0; + d->u.l.lens[i + 7] = 0; + d->u.l.lens[i + 8] = 0; + d->u.l.lens[i + 9] = 0; + i += rep_count; + } else { + /* Repeat zero 11 - 138 times. */ + STATIC_ASSERT(11 + BITMASK(7) == 138); + rep_count = 11 + (bitbuf & BITMASK(7)); + bitbuf >>= 7; + bitsleft -= 7; + __builtin_memset(&d->u.l.lens[i], 0, + rep_count * sizeof(d->u.l.lens[i])); + i += rep_count; + } + } while (i < num_litlen_syms + num_offset_syms); + + /* Unnecessary, but check this for consistency with zlib. */ + SAFETY_CHECK(i == num_litlen_syms + num_offset_syms); + + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + u16 len, nlen; + + /* + * Uncompressed block: copy 'len' bytes literally from the input + * buffer to the output buffer. + */ + + bitsleft -= 3; /* for BTYPE and BFINAL */ + + /* + * Align the bitstream to the next byte boundary. This means + * the next byte boundary as if we were reading a byte at a + * time. Therefore, we have to rewind 'in_next' by any bytes + * that have been refilled but not actually consumed yet (not + * counting overread bytes, which don't increment 'in_next'). 
+ */ + bitsleft = (u8)bitsleft; + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + in_next -= (bitsleft >> 3) - overread_count; + overread_count = 0; + bitbuf = 0; + bitsleft = 0; + + SAFETY_CHECK(in_end - in_next >= 4); + len = get_unaligned_le16(in_next); + nlen = get_unaligned_le16(in_next + 2); + in_next += 4; + + SAFETY_CHECK(len == (u16)~nlen); + if (unlikely(len > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + SAFETY_CHECK(len <= in_end - in_next); + + __builtin_memcpy(out_next, in_next, len); + in_next += len; + out_next += len; + + goto block_done; + + } else { + unsigned i; + + SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); + + /* + * Static Huffman block: build the decode tables for the static + * codes. Skip doing so if the tables are already set up from + * an earlier static block; this speeds up decompression of + * degenerate input of many empty or very short static blocks. + * + * Afterwards, the remainder is the same as decompressing a + * dynamic Huffman block. + */ + + bitbuf >>= 3; /* for BTYPE and BFINAL */ + bitsleft -= 3; + + if (d->static_codes_loaded) + goto have_decode_tables; + + d->static_codes_loaded = true; + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); + + for (i = 0; i < 144; i++) + d->u.l.lens[i] = 8; + for (; i < 256; i++) + d->u.l.lens[i] = 9; + for (; i < 280; i++) + d->u.l.lens[i] = 7; + for (; i < 288; i++) + d->u.l.lens[i] = 8; + + for (; i < 288 + 32; i++) + d->u.l.lens[i] = 5; + + num_litlen_syms = 288; + num_offset_syms = 32; + } + + /* Decompressing a Huffman block (either dynamic or static) */ + + SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); + SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); +have_decode_tables: + litlen_tablemask = BITMASK(d->litlen_tablebits); + + /* + * This is the "fastloop" for decoding literals and matches. It does + * bounds checks on in_next and out_next in the loop conditions so that + * additional bounds checks aren't needed inside the loop body. + * + * To reduce latency, the bitbuffer is refilled and the next litlen + * decode table entry is preloaded before each loop iteration. + */ + if (in_next >= in_fastloop_end || out_next >= out_fastloop_end) + goto generic_loop; + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + do { + u32 length, offset, lit; + const u8 *src; + u8 *dst; + + /* + * Consume the bits for the litlen decode table entry. Save the + * original bitbuf for later, in case the extra match length + * bits need to be extracted from it. + */ + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + + /* + * Begin by checking for a "fast" literal, i.e. a literal that + * doesn't need a subtable. + */ + if (entry & HUFFDEC_LITERAL) { + /* + * On 64-bit platforms, we decode up to 2 extra fast + * literals in addition to the primary item, as this + * increases performance and still leaves enough bits + * remaining for what follows. We could actually do 3, + * assuming LITLEN_TABLEBITS=11, but that actually + * decreases performance slightly (perhaps by messing + * with the branch prediction of the conditional refill + * that happens later while decoding the match offset). + * + * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN + * and FASTLOOP_MAX_BYTES_READ need to be updated if the + * number of extra literals decoded here is changed. 
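
For uncompressed (stored) blocks, the 4-byte header holds LEN and its one's complement NLEN, read little-endian after byte alignment. The sketch below is a separate illustration, not the code path above; parse_stored_header is a hypothetical helper showing the len == (u16)~nlen consistency check.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Parse the LEN/NLEN header of a DEFLATE stored block (after the bitstream
 * has been aligned to a byte boundary). Returns false if NLEN is not the
 * one's complement of LEN, which means the stream is corrupt. */
static bool parse_stored_header(const uint8_t hdr[4], uint16_t *len_out)
{
    uint16_t len  = (uint16_t)(hdr[0] | (hdr[1] << 8));  /* little-endian */
    uint16_t nlen = (uint16_t)(hdr[2] | (hdr[3] << 8));

    if (len != (uint16_t)~nlen)
        return false;
    *len_out = len;
    return true;
}

int main(void)
{
    const uint8_t hdr[4] = { 0x05, 0x00, 0xfa, 0xff };  /* LEN=5, NLEN=~5 */
    uint16_t len;

    if (parse_stored_header(hdr, &len))
        printf("stored block of %u literal bytes follows\n", len);
    else
        printf("corrupt stored-block header\n");
    return 0;
}
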
+ */ + if (/* enough bits for 2 fast literals + length + offset preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + LENGTH_MAXBITS, + OFFSET_TABLEBITS) && + /* enough bits for 2 fast literals + slow literal + litlen preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS)) { + /* 1st extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* 2nd extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* + * Another fast literal, but + * this one is in lieu of the + * primary item, so it doesn't + * count as one of the extras. + */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + } else { + /* + * Decode a literal. While doing so, preload + * the next litlen decode table entry and refill + * the bitbuffer. To reduce latency, we've + * arranged for there to be enough "preloadable" + * bits remaining to do the table preload + * independently of the refill. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD( + LITLEN_TABLEBITS, LITLEN_TABLEBITS)); + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + + /* + * It's not a literal entry, so it can be a length entry, a + * subtable pointer entry, or an end-of-block entry. Detect the + * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Subtable pointer or end-of-block entry */ + + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + + /* + * A subtable is required. Load and consume the + * subtable entry. The subtable entry can be of any + * type: literal, length, or end-of-block. + */ + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + + /* + * 32-bit platforms that use the byte-at-a-time refill + * method have to do a refill here for there to always + * be enough bits to decode a literal that requires a + * subtable, then preload the next litlen decode table + * entry; or to decode a match length that requires a + * subtable, then preload the offset decode table entry. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS) || + !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS, + OFFSET_TABLEBITS)) + REFILL_BITS_IN_FASTLOOP(); + if (entry & HUFFDEC_LITERAL) { + /* Decode a literal that required a subtable. */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + /* Else, it's a length that required a subtable. */ + } + + /* + * Decode the match length: the length base value associated + * with the litlen symbol (which we extract from the decode + * table entry), plus the extra length bits. We don't need to + * consume the extra length bits here, as they were included in + * the bits consumed by the entry earlier. 
We also don't need + * to check for too-long matches here, as this is inside the + * fastloop where it's already been verified that the output + * buffer has enough space remaining to copy a max-length match. + */ + length = entry >> 16; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* + * Decode the match offset. There are enough "preloadable" bits + * remaining to preload the offset decode table entry, but a + * refill might be needed before consuming it. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS, + OFFSET_TABLEBITS)); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS, + LITLEN_TABLEBITS)) { + /* + * Decoding a match offset on a 64-bit platform. We may + * need to refill once, but then we can decode the whole + * offset and preload the next litlen table entry. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + if (unlikely((u8)bitsleft < OFFSET_MAXBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + } else { + /* Decoding a match offset on a 32-bit platform */ + REFILL_BITS_IN_FASTLOOP(); + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + REFILL_BITS_IN_FASTLOOP(); + /* No further refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME( + OFFSET_MAXBITS - OFFSET_TABLEBITS)); + } else { + /* No refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS)); + } + } + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + offset = entry >> 16; + offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* Validate the match offset; needed even in the fastloop. */ + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + /* + * Before starting to issue the instructions to copy the match, + * refill the bitbuffer and preload the litlen decode table + * entry for the next loop iteration. This can increase + * performance by allowing the latency of the match copy to + * overlap with these other operations. To further reduce + * latency, we've arranged for there to be enough bits remaining + * to do the table preload independently of the refill, except + * on 32-bit platforms using the byte-at-a-time refill method. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD( + MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS, + OFFSET_MAXFASTBITS), + LITLEN_TABLEBITS) && + unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + + /* + * Copy the match. On most CPUs the fastest method is a + * word-at-a-time copy, unconditionally copying about 5 words + * since this is enough for most matches without being too much. + * + * The normal word-at-a-time copy works for offset >= WORDBYTES, + * which is most cases. 
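
A DEFLATE match copies 'length' bytes starting 'offset' bytes back in the output already produced; when offset < length the copy overlaps itself, which is why a front-to-back byte copy is the baseline semantics that the optimized word-at-a-time paths here must preserve. The sketch below is a standalone illustration with a hypothetical copy_match helper, not the fastloop code itself.

#include <stdio.h>
#include <stddef.h>

/* Append an LZ77 match to the output: copy 'length' bytes starting
 * 'offset' bytes before the current end of output. Overlapping copies
 * (offset < length) must proceed front to back so that freshly written
 * bytes can be re-read; offset 1 simply repeats the previous byte. */
static size_t copy_match(unsigned char *out, size_t out_pos,
                         size_t offset, size_t length)
{
    unsigned char *dst = out + out_pos;
    const unsigned char *src = dst - offset;

    for (size_t i = 0; i < length; i++)
        dst[i] = src[i];
    return out_pos + length;
}

int main(void)
{
    unsigned char out[32] = "abc";
    size_t pos = 3;

    /* Match with offset 3, length 6: appends "abcabc" after "abc". */
    pos = copy_match(out, pos, 3, 6);
    out[pos] = '\0';
    printf("%s\n", out);   /* abcabcabc */
    return 0;
}
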
The case of offset == 1 is also common + * and is worth optimizing for, since it is just RLE encoding of + * the previous byte, which is the result of compressing long + * runs of the same byte. + * + * Writing past the match 'length' is allowed here, since it's + * been ensured there is enough output space left for a slight + * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if + * the maximum possible overrun here is changed. + */ + if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) { + machine_word_t v; + + /* + * This part tends to get auto-vectorized, so keep it + * copying a multiple of 16 bytes at a time. + */ + v = (machine_word_t)0x0101010101010101 * src[0]; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST) { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + do { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + } while (dst < out_next); + } else { + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + } while (in_next < in_fastloop_end && out_next < out_fastloop_end); + + /* + * This is the generic loop for decoding literals and matches. This + * handles cases where in_next and out_next are close to the end of + * their respective buffers. Usually this loop isn't performance- + * critical, as most time is spent in the fastloop above instead. We + * therefore omit some optimizations here in favor of smaller code. 
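
The offset == 1 path above multiplies the repeated byte by 0x0101...01 to broadcast it into every byte lane of a machine word and then stores whole words. The standalone sketch below shows the same trick under the assumption of a 64-bit word, using memcpy() for the unaligned stores; broadcast_fill is a hypothetical helper, not libdeflate code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Fill 'n' bytes at 'dst' with the byte 'b' using word-sized stores.
 * Multiplying by 0x0101010101010101 copies the byte into every byte
 * lane of the 64-bit word. */
static void broadcast_fill(uint8_t *dst, uint8_t b, size_t n)
{
    uint64_t v = (uint64_t)0x0101010101010101ULL * b;
    size_t i = 0;

    for (; i + 8 <= n; i += 8)
        memcpy(dst + i, &v, 8);   /* whole-word stores */
    for (; i < n; i++)
        dst[i] = b;               /* tail bytes */
}

int main(void)
{
    uint8_t buf[21];

    broadcast_fill(buf, 'x', 20);
    buf[20] = '\0';
    printf("%s\n", buf);   /* twenty 'x' characters */
    return 0;
}
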
+ */ +generic_loop: + for (;;) { + u32 length, offset; + const u8 *src; + u8 *dst; + + REFILL_BITS(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) { + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + } + length = entry >> 16; + if (entry & HUFFDEC_LITERAL) { + if (unlikely(out_next == out_end)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + *out_next++ = length; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + if (unlikely(length > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + + if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS)) + REFILL_BITS(); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + if (!CAN_CONSUME(OFFSET_MAXBITS)) + REFILL_BITS(); + } + offset = entry >> 16; + offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8); + bitbuf >>= (u8)entry; + bitsleft -= entry; + + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + +block_done: + /* Finished decoding a block */ + + if (!is_final_block) + goto next_block; + + /* That was the last block. */ + + bitsleft = (u8)bitsleft; + + /* + * If any of the implicit appended zero bytes were consumed (not just + * refilled) before hitting end of stream, then the data is bad. + */ + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + + /* Optionally return the actual number of bytes consumed. */ + if (actual_in_nbytes_ret) { + /* Don't count bytes that were refilled but not consumed. */ + in_next -= (bitsleft >> 3) - overread_count; + + *actual_in_nbytes_ret = in_next - (u8 *)in; + } + + /* Optionally return the actual number of bytes written. 
*/ + if (actual_out_nbytes_ret) { + *actual_out_nbytes_ret = out_next - (u8 *)out; + } else { + if (out_next != out_end) + return LIBDEFLATE_SHORT_OUTPUT; + } + return LIBDEFLATE_SUCCESS; +} + +#undef FUNCNAME +#undef ATTRIBUTES +#undef EXTRACT_VARBITS +#undef EXTRACT_VARBITS8 diff --git a/packages/wasm/lib/libdeflate/deflate_compress.c b/packages/wasm/lib/libdeflate/deflate_compress.c new file mode 100644 index 00000000..14b92d5a --- /dev/null +++ b/packages/wasm/lib/libdeflate/deflate_compress.c @@ -0,0 +1,4119 @@ +/* + * deflate_compress.c - a compressor for DEFLATE + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "deflate_compress.h" +#include "deflate_constants.h" + +/******************************************************************************/ + +/* + * The following parameters can be changed at build time to customize the + * compression algorithms slightly: + * + * (Note, not all customizable parameters are here. Some others can be found in + * libdeflate_alloc_compressor() and in *_matchfinder.h.) + */ + +/* + * If this parameter is defined to 1, then the near-optimal parsing algorithm + * will be included, and compression levels 10-12 will use it. This algorithm + * usually produces a compression ratio significantly better than the other + * algorithms. However, it is slow. If this parameter is defined to 0, then + * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm. + */ +#define SUPPORT_NEAR_OPTIMAL_PARSING 1 + +/* + * This is the minimum block length that the compressor will use, in + * uncompressed bytes. This should be a value below which using shorter blocks + * is unlikely to be worthwhile, due to the per-block overhead. This value does + * not apply to the final block, which may be shorter than this (if the input is + * shorter, it will have to be), or to the final uncompressed block in a series + * of uncompressed blocks that cover more than UINT16_MAX bytes. + * + * This value is also approximately the amount by which what would otherwise be + * the second-to-last block is allowed to grow past the soft maximum length in + * order to avoid having to use a very short final block. + * + * Defining a fixed minimum block length is needed in order to guarantee a + * reasonable upper bound on the compressed size. It's also needed because our + * block splitting algorithm doesn't work well on very short blocks. 
+ */ +#define MIN_BLOCK_LENGTH 5000 + +/* + * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft + * maximum block length, in uncompressed bytes. The compressor will try to end + * blocks at this length, but it may go slightly past it if there is a match + * that straddles this limit or if the input data ends soon after this limit. + * This parameter doesn't apply to uncompressed blocks, which the DEFLATE format + * limits to 65535 bytes. + * + * This should be a value above which it is very likely that splitting the block + * would produce a better compression ratio. For the near-optimal compressor, + * increasing/decreasing this parameter will increase/decrease per-compressor + * memory usage linearly. + */ +#define SOFT_MAX_BLOCK_LENGTH 300000 + +/* + * For the greedy, lazy, and lazy2 compressors: this is the length of the + * sequence store, which is an array where the compressor temporarily stores + * matches that it's going to use in the current block. This value is the + * maximum number of matches that can be used in a block. If the sequence store + * fills up, then the compressor will be forced to end the block early. This + * value should be large enough so that this rarely happens, due to the block + * being ended normally before then. Increasing/decreasing this value will + * increase/decrease per-compressor memory usage linearly. + */ +#define SEQ_STORE_LENGTH 50000 + +/* + * For deflate_compress_fastest(): This is the soft maximum block length. + * deflate_compress_fastest() doesn't use the regular block splitting algorithm; + * it only ends blocks when they reach FAST_SOFT_MAX_BLOCK_LENGTH bytes or + * FAST_SEQ_STORE_LENGTH matches. Therefore, this value should be lower than + * the regular SOFT_MAX_BLOCK_LENGTH. + */ +#define FAST_SOFT_MAX_BLOCK_LENGTH 65535 + +/* + * For deflate_compress_fastest(): this is the length of the sequence store. + * This is like SEQ_STORE_LENGTH, but this should be a lower value. + */ +#define FAST_SEQ_STORE_LENGTH 8192 + +/* + * These are the maximum codeword lengths, in bits, the compressor will use for + * each Huffman code. The DEFLATE format defines limits for these. However, + * further limiting litlen codewords to 14 bits is beneficial, since it has + * negligible effect on compression ratio but allows some optimizations when + * outputting bits. (It allows 4 literals to be written at once rather than 3.) + */ +#define MAX_LITLEN_CODEWORD_LEN 14 +#define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN +#define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* Parameters specific to the near-optimal parsing algorithm */ + +/* + * BIT_COST is a scaling factor that allows the near-optimal compressor to + * consider fractional bit costs when deciding which literal/match sequence to + * use. This is useful when the true symbol costs are unknown. For example, if + * the compressor thinks that a symbol has 6.5 bits of entropy, it can set its + * cost to 6.5 bits rather than have to use 6 or 7 bits. Although in the end + * each symbol will use a whole number of bits due to the Huffman coding, + * considering fractional bits can be helpful due to the limited information. + * + * BIT_COST should be a power of 2. A value of 8 or 16 works well. A higher + * value isn't very useful since the calculations are approximate anyway. + * + * BIT_COST doesn't apply to deflate_flush_block() and + * deflate_compute_true_cost(), which consider whole bits. 
+ */ +#define BIT_COST 16 + +/* + * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to + * be needed to output a symbol that was unused in the previous optimization + * pass. Assigning a default cost allows the symbol to be used in the next + * optimization pass. However, the cost should be relatively high because the + * symbol probably won't be used very many times (if at all). + */ +#define LITERAL_NOSTAT_BITS 13 +#define LENGTH_NOSTAT_BITS 13 +#define OFFSET_NOSTAT_BITS 10 + +/* + * This is (slightly less than) the maximum number of matches that the + * near-optimal compressor will cache per block. This behaves similarly to + * SEQ_STORE_LENGTH for the other compressors. + */ +#define MATCH_CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/******************************************************************************/ + +/* Include the needed matchfinders. */ +#define MATCHFINDER_WINDOW_ORDER DEFLATE_WINDOW_ORDER +#include "hc_matchfinder.h" +#include "ht_matchfinder.h" +#if SUPPORT_NEAR_OPTIMAL_PARSING +# include "bt_matchfinder.h" +/* + * This is the maximum number of matches the binary trees matchfinder can find + * at a single position. Since the matchfinder never finds more than one match + * for the same length, presuming one of each possible length is sufficient for + * an upper bound. (This says nothing about whether it is worthwhile to + * consider so many matches; this is just defining the worst case.) + */ +#define MAX_MATCHES_PER_POS \ + (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) +#endif + +/* + * The largest block length we will ever use is when the final block is of + * length SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, or when any block is of + * length SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN. The latter case + * occurs when the lazy2 compressor chooses two literals and a maximum-length + * match, starting at SOFT_MAX_BLOCK_LENGTH - 1. + */ +#define MAX_BLOCK_LENGTH \ + MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \ + SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN) + +static void +check_buildtime_parameters(void) +{ + /* + * Verify that MIN_BLOCK_LENGTH is being honored, as + * libdeflate_deflate_compress_bound() depends on it. + */ + STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); + STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); + STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN >= + MIN_BLOCK_LENGTH); + STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >= + MIN_BLOCK_LENGTH); +#if SUPPORT_NEAR_OPTIMAL_PARSING + STATIC_ASSERT(MIN_BLOCK_LENGTH * MAX_MATCHES_PER_POS <= + MATCH_CACHE_LENGTH); +#endif + + /* The definition of MAX_BLOCK_LENGTH assumes this. */ + STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH); + + /* Verify that the sequence stores aren't uselessly large. */ + STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <= + SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); + STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN <= + FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); + + /* Verify that the maximum codeword lengths are valid. 
*/ + STATIC_ASSERT( + MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); + STATIC_ASSERT( + MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); + STATIC_ASSERT( + MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); + STATIC_ASSERT( + (1U << MAX_LITLEN_CODEWORD_LEN) >= DEFLATE_NUM_LITLEN_SYMS); + STATIC_ASSERT( + (1U << MAX_OFFSET_CODEWORD_LEN) >= DEFLATE_NUM_OFFSET_SYMS); + STATIC_ASSERT( + (1U << MAX_PRE_CODEWORD_LEN) >= DEFLATE_NUM_PRECODE_SYMS); +} + +/******************************************************************************/ + +/* Table: length slot => length slot base value */ +static const unsigned deflate_length_slot_base[] = { + 3, 4, 5, 6, 7, 8, 9, 10, + 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, + 131, 163, 195, 227, 258, +}; + +/* Table: length slot => number of extra length bits */ +static const u8 deflate_extra_length_bits[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, + 5, 5, 5, 5, 0, +}; + +/* Table: offset slot => offset slot base value */ +static const unsigned deflate_offset_slot_base[] = { + 1, 2, 3, 4, 5, 7, 9, 13, + 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, + 4097, 6145, 8193, 12289, 16385, 24577, +}; + +/* Table: offset slot => number of extra offset bits */ +static const u8 deflate_extra_offset_bits[] = { + 0, 0, 0, 0, 1, 1, 2, 2, + 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, + 11, 11, 12, 12, 13, 13, +}; + +/* Table: length => length slot */ +static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { + 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, + 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, + 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, + 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 28, +}; + +/* + * Table: 'offset - 1 => offset_slot' for offset <= 256. + * This was generated by scripts/gen_offset_slot_map.py. 
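
The three length tables above encode the DEFLATE mapping between a match length (3..258) and a (length slot, extra bits) pair. The sketch below re-derives the slot by searching the slot base table rather than using the direct 259-entry map, then checks the round trip; the base and extra-bit values are copied from the tables above, while length_to_slot is a hypothetical helper for illustration.

#include <stdio.h>

/* Length-slot base values and extra-bit counts (same values as above). */
static const unsigned slot_base[] = {
    3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
    35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258,
};
static const unsigned slot_extra_bits[] = {
    0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
    3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0,
};
#define NUM_SLOTS (sizeof(slot_base) / sizeof(slot_base[0]))

/* Find the slot whose range contains 'length' (3 <= length <= 258). */
static unsigned length_to_slot(unsigned length)
{
    unsigned slot = NUM_SLOTS - 1;

    while (slot_base[slot] > length)
        slot--;
    return slot;
}

int main(void)
{
    unsigned length = 100;
    unsigned slot = length_to_slot(length);
    unsigned extra = length - slot_base[slot];

    printf("length %u -> slot %u, %u extra bits, extra value %u\n",
           length, slot, slot_extra_bits[slot], extra);
    /* Reconstruct: slot base + extra value gives back the original length. */
    printf("reconstructed: %u\n", slot_base[slot] + extra);
    return 0;
}
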
+ */ +static const u8 deflate_offset_slot[256] = { + 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, +}; + +/* The order in which precode codeword lengths are stored */ +static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 +}; + +/* Table: precode symbol => number of extra bits */ +static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7 +}; + +/* Codewords for the DEFLATE Huffman codes */ +struct deflate_codewords { + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 offset[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* + * Codeword lengths (in bits) for the DEFLATE Huffman codes. + * A zero length means the corresponding symbol had zero frequency. + */ +struct deflate_lens { + u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u8 offset[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* Codewords and lengths for the DEFLATE Huffman codes */ +struct deflate_codes { + struct deflate_codewords codewords; + struct deflate_lens lens; +}; + +/* Symbol frequency counters for the DEFLATE Huffman codes */ +struct deflate_freqs { + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 offset[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* + * Represents a run of literals followed by a match or end-of-block. This + * struct is needed to temporarily store items chosen by the parser, since items + * cannot be written until all items for the block have been chosen and the + * block's Huffman codes have been computed. + */ +struct deflate_sequence { + + /* + * Bits 0..22: the number of literals in this run. This may be 0 and + * can be at most MAX_BLOCK_LENGTH. The literals are not stored + * explicitly in this structure; instead, they are read directly from + * the uncompressed data. + * + * Bits 23..31: the length of the match which follows the literals, or 0 + * if this literal run was the last in the block, so there is no match + * which follows it. + */ +#define SEQ_LENGTH_SHIFT 23 +#define SEQ_LITRUNLEN_MASK (((u32)1 << SEQ_LENGTH_SHIFT) - 1) + u32 litrunlen_and_length; + + /* + * If 'length' doesn't indicate end-of-block, then this is the offset of + * the match which follows the literals. + */ + u16 offset; + + /* + * If 'length' doesn't indicate end-of-block, then this is the offset + * slot of the match which follows the literals. 
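
'litrunlen_and_length' packs the literal-run length into bits 0..22 and the following match length into bits 23..31. A small standalone sketch of that packing, reusing the SEQ_LENGTH_SHIFT / SEQ_LITRUNLEN_MASK layout defined above; pack_seq is a hypothetical helper, not part of the compressor.

#include <stdint.h>
#include <stdio.h>

#define SEQ_LENGTH_SHIFT   23
#define SEQ_LITRUNLEN_MASK (((uint32_t)1 << SEQ_LENGTH_SHIFT) - 1)

/* Pack a run of 'litrunlen' literals followed by a match of 'length'
 * bytes (length 0 means this was the last literal run of the block). */
static uint32_t pack_seq(uint32_t litrunlen, uint32_t length)
{
    return litrunlen | (length << SEQ_LENGTH_SHIFT);
}

int main(void)
{
    uint32_t v = pack_seq(12345, 258);   /* 12345 literals, then a 258-byte match */

    printf("litrunlen=%u length=%u\n",
           (unsigned)(v & SEQ_LITRUNLEN_MASK),
           (unsigned)(v >> SEQ_LENGTH_SHIFT));
    return 0;
}
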
+ */ + u16 offset_slot; +}; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* Costs for the near-optimal parsing algorithm */ +struct deflate_costs { + + /* The cost to output each possible literal */ + u32 literal[DEFLATE_NUM_LITERALS]; + + /* The cost to output each possible match length */ + u32 length[DEFLATE_MAX_MATCH_LEN + 1]; + + /* The cost to output a match offset of each possible offset slot */ + u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* + * This structure represents a byte position in the input data and a node in the + * graph of possible match/literal choices for the current block. + * + * Logically, each incoming edge to this node is labeled with a literal or a + * match that can be taken to reach this position from an earlier position; and + * each outgoing edge from this node is labeled with a literal or a match that + * can be taken to advance from this position to a later position. + * + * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we + * associate with each node just two pieces of information: + * + * 'cost_to_end' is the minimum cost to reach the end of the block from + * this position. + * + * 'item' represents the literal or match that must be chosen from here to + * reach the end of the block with the minimum cost. Equivalently, this + * can be interpreted as the label of the outgoing edge on the minimum-cost + * path to the "end of block" node from this node. + */ +struct deflate_optimum_node { + + u32 cost_to_end; + + /* + * Notes on the match/literal representation used here: + * + * The low bits of 'item' are the length: 1 if this is a literal, + * or the match length if this is a match. + * + * The high bits of 'item' are the actual literal byte if this is a + * literal, or the match offset if this is a match. + */ +#define OPTIMUM_OFFSET_SHIFT 9 +#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1) + u32 item; + +}; + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/* Block split statistics. See "Block splitting algorithm" below. */ +#define NUM_LITERAL_OBSERVATION_TYPES 8 +#define NUM_MATCH_OBSERVATION_TYPES 2 +#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \ + NUM_MATCH_OBSERVATION_TYPES) +#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 +struct block_split_stats { + u32 new_observations[NUM_OBSERVATION_TYPES]; + u32 observations[NUM_OBSERVATION_TYPES]; + u32 num_new_observations; + u32 num_observations; +}; + +struct deflate_output_bitstream; + +/* The main DEFLATE compressor structure */ +struct libdeflate_compressor { + + /* Pointer to the compress() implementation chosen at allocation time */ + void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in, + size_t in_nbytes, struct deflate_output_bitstream *os); + + /* The compression level with which this compressor was created */ + unsigned compression_level; + + /* Anything of this size or less we won't bother trying to compress. 
*/ + size_t max_passthrough_size; + + /* + * The maximum search depth: consider at most this many potential + * matches at each position + */ + unsigned max_search_depth; + + /* + * The "nice" match length: if a match of this length is found, choose + * it immediately without further consideration + */ + unsigned nice_match_length; + + /* Frequency counters for the current block */ + struct deflate_freqs freqs; + + /* Block split statistics for the current block */ + struct block_split_stats split_stats; + + /* Dynamic Huffman codes for the current block */ + struct deflate_codes codes; + + /* The static Huffman codes defined by the DEFLATE format */ + struct deflate_codes static_codes; + + /* Temporary space for block flushing */ + union { + /* Information about the precode */ + struct { + u32 freqs[DEFLATE_NUM_PRECODE_SYMS]; + u32 codewords[DEFLATE_NUM_PRECODE_SYMS]; + u8 lens[DEFLATE_NUM_PRECODE_SYMS]; + unsigned items[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS]; + unsigned num_litlen_syms; + unsigned num_offset_syms; + unsigned num_explicit_lens; + unsigned num_items; + } precode; + /* + * The "full" length codewords. Used only after the information + * in 'precode' is no longer needed. + */ + struct { + u32 codewords[DEFLATE_MAX_MATCH_LEN + 1]; + u8 lens[DEFLATE_MAX_MATCH_LEN + 1]; + } length; + } o; + + union { + /* Data for greedy or lazy parsing */ + struct { + /* Hash chains matchfinder */ + struct hc_matchfinder hc_mf; + + /* Matches and literals chosen for the current block */ + struct deflate_sequence sequences[SEQ_STORE_LENGTH + 1]; + + } g; /* (g)reedy */ + + /* Data for fastest parsing */ + struct { + /* Hash table matchfinder */ + struct ht_matchfinder ht_mf; + + /* Matches and literals chosen for the current block */ + struct deflate_sequence sequences[ + FAST_SEQ_STORE_LENGTH + 1]; + + } f; /* (f)astest */ + + #if SUPPORT_NEAR_OPTIMAL_PARSING + /* Data for near-optimal parsing */ + struct { + + /* Binary tree matchfinder */ + struct bt_matchfinder bt_mf; + + /* + * Cached matches for the current block. This array + * contains the matches that were found at each position + * in the block. Specifically, for each position, there + * is a list of matches found at that position, if any, + * sorted by strictly increasing length. In addition, + * following the matches for each position, there is a + * special 'struct lz_match' whose 'length' member + * contains the number of matches found at that + * position, and whose 'offset' member contains the + * literal at that position. + * + * Note: in rare cases, there will be a very high number + * of matches in the block and this array will overflow. + * If this happens, we force the end of the current + * block. MATCH_CACHE_LENGTH is the length at which we + * actually check for overflow. The extra slots beyond + * this are enough to absorb the worst case overflow, + * which occurs if starting at + * &match_cache[MATCH_CACHE_LENGTH - 1], we write + * MAX_MATCHES_PER_POS matches and a match count header, + * then skip searching for matches at + * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the + * match count header for each. + */ + struct lz_match match_cache[MATCH_CACHE_LENGTH + + MAX_MATCHES_PER_POS + + DEFLATE_MAX_MATCH_LEN - 1]; + + /* + * Array of nodes, one per position, for running the + * minimum-cost path algorithm. + * + * This array must be large enough to accommodate the + * worst-case number of nodes, which is MAX_BLOCK_LENGTH + * plus 1 for the end-of-block node. 
+ */ + struct deflate_optimum_node optimum_nodes[ + MAX_BLOCK_LENGTH + 1]; + + /* The current cost model being used */ + struct deflate_costs costs; + + /* Saved cost model */ + struct deflate_costs costs_saved; + + /* + * A table that maps match offset to offset slot. This + * differs from deflate_offset_slot[] in that this is a + * full map, not a condensed one. The full map is more + * appropriate for the near-optimal parser, since the + * near-optimal parser does more offset => offset_slot + * translations, it doesn't intersperse them with + * matchfinding (so cache evictions are less of a + * concern), and it uses more memory anyway. + */ + u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1]; + + /* Literal/match statistics saved from previous block */ + u32 prev_observations[NUM_OBSERVATION_TYPES]; + u32 prev_num_observations; + + /* + * Approximate match length frequencies based on a + * greedy parse, gathered during matchfinding. This is + * used for setting the initial symbol costs. + */ + u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; + u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; + + /* + * The maximum number of optimization passes + * (min-cost path searches) per block. + * Larger values = more compression. + */ + unsigned max_optim_passes; + + /* + * If an optimization pass improves the cost by fewer + * than this number of bits, then optimization will stop + * early, before max_optim_passes has been reached. + * Smaller values = more compression. + */ + unsigned min_improvement_to_continue; + + /* + * The minimum number of bits that would need to be + * saved for it to be considered worth the time to + * regenerate and use the min-cost path from a previous + * optimization pass, in the case where the final + * optimization pass actually increased the cost. + * Smaller values = more compression. + */ + unsigned min_bits_to_use_nonfinal_path; + + /* + * The maximum block length, in uncompressed bytes, at + * which to find and consider the optimal match/literal + * list for the static Huffman codes. This strategy + * improves the compression ratio produced by static + * Huffman blocks and can discover more cases in which + * static blocks are worthwhile. This helps mostly with + * small blocks, hence why this parameter is a max_len. + * + * Above this block length, static Huffman blocks are + * only used opportunistically. I.e. a static Huffman + * block is only used if a static block using the same + * match/literal list as the optimized dynamic block + * happens to be cheaper than the dynamic block itself. + */ + unsigned max_len_to_optimize_static_block; + + } n; /* (n)ear-optimal */ + #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + + } p; /* (p)arser */ +}; + +/* + * The type for the bitbuffer variable, which temporarily holds bits that are + * being packed into bytes and written to the output buffer. For best + * performance, this should have size equal to a machine word. + */ +typedef machine_word_t bitbuf_t; + +/* + * The capacity of the bitbuffer, in bits. This is 1 less than the real size, + * in order to avoid undefined behavior when doing bitbuf >>= bitcount & ~7. + */ +#define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) + +/* + * Can the specified number of bits always be added to 'bitbuf' after any + * pending bytes have been flushed? There can be up to 7 bits remaining after a + * flush, so the count must not exceed BITBUF_NBITS after adding 'n' more bits. 
+ */ +#define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS) + +/* + * Structure to keep track of the current state of sending bits to the + * compressed output buffer + */ +struct deflate_output_bitstream { + + /* Bits that haven't yet been written to the output buffer */ + bitbuf_t bitbuf; + + /* + * Number of bits currently held in @bitbuf. This can be between 0 and + * BITBUF_NBITS in general, or between 0 and 7 after a flush. + */ + unsigned bitcount; + + /* + * Pointer to the position in the output buffer at which the next byte + * should be written + */ + u8 *next; + + /* Pointer to the end of the output buffer */ + u8 *end; + + /* true if the output buffer ran out of space */ + bool overflow; +}; + +/* + * Add some bits to the bitbuffer variable of the output bitstream. The caller + * must ensure that 'bitcount + n <= BITBUF_NBITS', by calling FLUSH_BITS() + * frequently enough. + */ +#define ADD_BITS(bits, n) \ +do { \ + bitbuf |= (bitbuf_t)(bits) << bitcount; \ + bitcount += (n); \ + ASSERT(bitcount <= BITBUF_NBITS); \ +} while (0) + +/* + * Flush bits from the bitbuffer variable to the output buffer. After this, the + * bitbuffer will contain at most 7 bits (a partial byte). + * + * Since deflate_flush_block() verified ahead of time that there is enough space + * remaining before actually writing the block, it's guaranteed that out_next + * won't exceed os->end. However, there might not be enough space remaining to + * flush a whole word, even though that's fastest. Therefore, flush a whole + * word if there is space for it, otherwise flush a byte at a time. + */ +#define FLUSH_BITS() \ +do { \ + if (UNALIGNED_ACCESS_IS_FAST && likely(out_next < out_fast_end)) { \ + /* Flush a whole word (branchlessly). */ \ + put_unaligned_leword(bitbuf, out_next); \ + bitbuf >>= bitcount & ~7; \ + out_next += bitcount >> 3; \ + bitcount &= 7; \ + } else { \ + /* Flush a byte at a time. */ \ + while (bitcount >= 8) { \ + ASSERT(out_next < os->end); \ + *out_next++ = bitbuf; \ + bitcount -= 8; \ + bitbuf >>= 8; \ + } \ + } \ +} while (0) + +/* + * Given the binary tree node A[subtree_idx] whose children already satisfy the + * maxheap property, swap the node with its greater child until it is greater + * than or equal to both of its children, so that the maxheap property is + * satisfied in the subtree rooted at A[subtree_idx]. 'A' uses 1-based indices. + */ +static void +heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) +{ + unsigned parent_idx; + unsigned child_idx; + u32 v; + + v = A[subtree_idx]; + parent_idx = subtree_idx; + while ((child_idx = parent_idx * 2) <= length) { + if (child_idx < length && A[child_idx + 1] > A[child_idx]) + child_idx++; + if (v >= A[child_idx]) + break; + A[parent_idx] = A[child_idx]; + parent_idx = child_idx; + } + A[parent_idx] = v; +} + +/* + * Rearrange the array 'A' so that it satisfies the maxheap property. + * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. + */ +static void +heapify_array(u32 A[], unsigned length) +{ + unsigned subtree_idx; + + for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) + heapify_subtree(A, length, subtree_idx); +} + +/* + * Sort the array 'A', which contains 'length' unsigned 32-bit integers. + * + * Note: name this function heap_sort() instead of heapsort() to avoid colliding + * with heapsort() from stdlib.h on BSD-derived systems. 
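
The output side accumulates codewords LSB-first in a bitbuffer and flushes whole bytes, as the ADD_BITS/FLUSH_BITS macros above do. The sketch below mirrors that idea in a simplified standalone form (byte-at-a-time flushing, no word-sized fast path, no overflow handling); struct bitwriter, add_bits, and flush_bits are hypothetical names for illustration.

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Minimal LSB-first bit writer. The caller must flush often enough that
 * bitcount never exceeds the bitbuffer width. */
struct bitwriter {
    uint32_t bitbuf;    /* bits not yet written, LSB = oldest bit */
    unsigned bitcount;  /* number of valid bits in bitbuf */
    uint8_t *out;
    size_t out_pos;
};

static void add_bits(struct bitwriter *bw, uint32_t bits, unsigned n)
{
    bw->bitbuf |= bits << bw->bitcount;
    bw->bitcount += n;
}

static void flush_bits(struct bitwriter *bw)
{
    while (bw->bitcount >= 8) {
        bw->out[bw->out_pos++] = (uint8_t)bw->bitbuf;
        bw->bitbuf >>= 8;
        bw->bitcount -= 8;
    }
}

int main(void)
{
    uint8_t buf[8] = { 0 };
    struct bitwriter bw = { 0, 0, buf, 0 };

    add_bits(&bw, 1, 1);     /* BFINAL = 1 */
    add_bits(&bw, 1, 2);     /* BTYPE = 01 (static Huffman) */
    add_bits(&bw, 0x55, 7);  /* some further code bits */
    flush_bits(&bw);

    printf("wrote %zu byte(s), first byte = 0x%02x, %u bit(s) pending\n",
           bw.out_pos, buf[0], bw.bitcount);
    return 0;
}
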
+ */ +static void +heap_sort(u32 A[], unsigned length) +{ + A--; /* Use 1-based indices */ + + heapify_array(A, length); + + while (length >= 2) { + u32 tmp = A[length]; + + A[length] = A[1]; + A[1] = tmp; + length--; + heapify_subtree(A, length, 1); + } +} + +#define NUM_SYMBOL_BITS 10 +#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS) +#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) +#define FREQ_MASK (~SYMBOL_MASK) + +#define GET_NUM_COUNTERS(num_syms) (num_syms) + +/* + * Sort the symbols primarily by frequency and secondarily by symbol value. + * Discard symbols with zero frequency and fill in an array with the remaining + * symbols, along with their frequencies. The low NUM_SYMBOL_BITS bits of each + * array entry will contain the symbol value, and the remaining bits will + * contain the frequency. + * + * @num_syms + * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS. + * + * @freqs[num_syms] + * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1. + * + * @lens[num_syms] + * An array that eventually will hold the length of each codeword. This + * function only fills in the codeword lengths for symbols that have zero + * frequency, which are not well defined per se but will be set to 0. + * + * @symout[num_syms] + * The output array, described above. + * + * Returns the number of entries in 'symout' that were filled. This is the + * number of symbols that have nonzero frequency. + */ +static unsigned +sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[]) +{ + unsigned sym; + unsigned i; + unsigned num_used_syms; + unsigned num_counters; + unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; + + /* + * We use heapsort, but with an added optimization. Since often most + * symbol frequencies are low, we first do a count sort using a limited + * number of counters. High frequencies are counted in the last + * counter, and only they will be sorted with heapsort. + * + * Note: with more symbols, it is generally beneficial to have more + * counters. About 1 counter per symbol seems fastest. + */ + + num_counters = GET_NUM_COUNTERS(num_syms); + + __builtin_memset(counters, 0, num_counters * sizeof(counters[0])); + + /* Count the frequencies. */ + for (sym = 0; sym < num_syms; sym++) + counters[MIN(freqs[sym], num_counters - 1)]++; + + /* + * Make the counters cumulative, ignoring the zero-th, which counted + * symbols with zero frequency. As a side effect, this calculates the + * number of symbols with nonzero frequency. + */ + num_used_syms = 0; + for (i = 1; i < num_counters; i++) { + unsigned count = counters[i]; + + counters[i] = num_used_syms; + num_used_syms += count; + } + + /* + * Sort nonzero-frequency symbols using the counters. At the same time, + * set the codeword lengths of zero-frequency symbols to 0. + */ + for (sym = 0; sym < num_syms; sym++) { + u32 freq = freqs[sym]; + + if (freq != 0) { + symout[counters[MIN(freq, num_counters - 1)]++] = + sym | (freq << NUM_SYMBOL_BITS); + } else { + lens[sym] = 0; + } + } + + /* Sort the symbols counted in the last counter. */ + heap_sort(symout + counters[num_counters - 2], + counters[num_counters - 1] - counters[num_counters - 2]); + + return num_used_syms; +} + +/* + * Build a Huffman tree. 
+ * + * This is an optimized implementation that + * (a) takes advantage of the frequencies being already sorted; + * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman + * tree are sufficient to generate a canonical code; + * (c) Only stores parent pointers, not child pointers; + * (d) Produces the nodes in the same memory used for input frequency + * information. + * + * Array 'A', which contains 'sym_count' entries, is used for both input and + * output. For this function, 'sym_count' must be at least 2. + * + * For input, the array must contain the frequencies of the symbols, sorted in + * increasing order. Specifically, each entry must contain a frequency left + * shifted by NUM_SYMBOL_BITS bits. Any data in the low NUM_SYMBOL_BITS bits of + * the entries will be ignored by this function. Although these bits will, in + * fact, contain the symbols that correspond to the frequencies, this function + * is concerned with frequencies only and keeps the symbols as-is. + * + * For output, this function will produce the non-leaf nodes of the Huffman + * tree. These nodes will be stored in the first (sym_count - 1) entries of the + * array. Entry A[sym_count - 2] will represent the root node. Each other node + * will contain the zero-based index of its parent node in 'A', left shifted by + * NUM_SYMBOL_BITS bits. The low NUM_SYMBOL_BITS bits of each entry in A will + * be kept as-is. Again, note that although these low bits will, in fact, + * contain a symbol value, this symbol will have *no relationship* with the + * Huffman tree node that happens to occupy the same slot. This is because this + * implementation only generates the non-leaf nodes of the tree. + */ +static void +build_tree(u32 A[], unsigned sym_count) +{ + const unsigned last_idx = sym_count - 1; + + /* Index of the next lowest frequency leaf that still needs a parent */ + unsigned i = 0; + + /* + * Index of the next lowest frequency non-leaf that still needs a + * parent, or 'e' if there is currently no such node + */ + unsigned b = 0; + + /* Index of the next spot for a non-leaf (will overwrite a leaf) */ + unsigned e = 0; + + do { + u32 new_freq; + + /* + * Select the next two lowest frequency nodes among the leaves + * A[i] and non-leaves A[b], and create a new node A[e] to be + * their parent. Set the new node's frequency to the sum of the + * frequencies of its two children. + * + * Usually the next two lowest frequency nodes are of the same + * type (leaf or non-leaf), so check those cases first. + */ + if (i + 1 <= last_idx && + (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) { + /* Two leaves */ + new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK); + i += 2; + } else if (b + 2 <= e && + (i > last_idx || + (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) { + /* Two non-leaves */ + new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + A[b + 1] = (e << NUM_SYMBOL_BITS) | + (A[b + 1] & SYMBOL_MASK); + b += 2; + } else { + /* One leaf and one non-leaf */ + new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + i++; + b++; + } + A[e] = new_freq | (A[e] & SYMBOL_MASK); + /* + * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the + * tree is complete once we've created 'n - 1' non-leaves. 
+ */ + } while (++e < last_idx); +} + +/* + * Given the stripped-down Huffman tree constructed by build_tree(), determine + * the number of codewords that should be assigned each possible length, taking + * into account the length-limited constraint. + * + * @A + * The array produced by build_tree(), containing parent index information + * for the non-leaf nodes of the Huffman tree. Each entry in this array is + * a node; a node's parent always has a greater index than that node + * itself. This function will overwrite the parent index information in + * this array, so essentially it will destroy the tree. However, the data + * in the low NUM_SYMBOL_BITS of each entry will be preserved. + * + * @root_idx + * The 0-based index of the root node in 'A', and consequently one less + * than the number of tree node entries in 'A'. (Or, really 2 less than + * the actual length of 'A'.) + * + * @len_counts + * An array of length ('max_codeword_len' + 1) in which the number of + * codewords having each length <= max_codeword_len will be returned. + * + * @max_codeword_len + * The maximum permissible codeword length. + */ +static void +compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[], + unsigned max_codeword_len) +{ + unsigned len; + int node; + + /* + * The key observations are: + * + * (1) We can traverse the non-leaf nodes of the tree, always visiting a + * parent before its children, by simply iterating through the array + * in reverse order. Consequently, we can compute the depth of each + * node in one pass, overwriting the parent indices with depths. + * + * (2) We can initially assume that in the real Huffman tree, both + * children of the root are leaves. This corresponds to two + * codewords of length 1. Then, whenever we visit a (non-leaf) node + * during the traversal, we modify this assumption to account for + * the current node *not* being a leaf, but rather its two children + * being leaves. This causes the loss of one codeword for the + * current depth and the addition of two codewords for the current + * depth plus one. + * + * (3) We can handle the length-limited constraint fairly easily by + * simply using the largest length available when a depth exceeds + * max_codeword_len. + */ + + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + len_counts[1] = 2; + + /* Set the root node's depth to 0. */ + A[root_idx] &= SYMBOL_MASK; + + for (node = root_idx - 1; node >= 0; node--) { + + /* Calculate the depth of this node. */ + + unsigned parent = A[node] >> NUM_SYMBOL_BITS; + unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; + unsigned depth = parent_depth + 1; + + /* + * Set the depth of this node so that it is available when its + * children (if any) are processed. + */ + A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); + + /* + * If needed, decrease the length to meet the length-limited + * constraint. This is not the optimal method for generating + * length-limited Huffman codes! But it should be good enough. + */ + if (depth >= max_codeword_len) { + depth = max_codeword_len; + do { + depth--; + } while (len_counts[depth] == 0); + } + + /* + * Account for the fact that we have a non-leaf node at the + * current depth. + */ + len_counts[depth]--; + len_counts[depth + 1] += 2; + } +} + +/* + * DEFLATE uses bit-reversed codewords, so we must bit-reverse the codewords + * after generating them. All codewords have length <= 16 bits. If the CPU has + * a bit-reversal instruction, then that is the fastest method. 
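+ * (Bit-reversing a codeword means mirroring it within its length: the 3-bit
+ * codeword 0b110 becomes 0b011, for instance.)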
Otherwise the + * fastest method is to reverse the bits in each of the two bytes using a table. + * The table method is slightly faster than using bitwise operations to flip + * adjacent 1, 2, 4, and then 8-bit fields, even if 2 to 4 codewords are packed + * into a machine word and processed together using that method. + */ + +#ifdef rbit32 +static u32 reverse_codeword(u32 codeword, u8 len) +{ + return rbit32(codeword) >> ((32 - len) & 31); +} +#else +/* Generated by scripts/gen_bitreverse_tab.py */ +static const u8 bitreverse_tab[256] = { + 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, + 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, + 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, + 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, + 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, + 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, + 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, + 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, + 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, + 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, + 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, + 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, + 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, + 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, + 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, + 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, + 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, + 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, + 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, + 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, + 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, + 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, + 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, + 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, + 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, + 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, + 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, + 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, + 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, + 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, + 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, + 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, +}; + +static u32 reverse_codeword(u32 codeword, u8 len) +{ + STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); + codeword = ((u32)bitreverse_tab[codeword & 0xff] << 8) | + bitreverse_tab[codeword >> 8]; + return codeword >> (16 - len); +} +#endif /* !rbit32 */ + +/* + * Generate the codewords for a canonical Huffman code. + * + * @A + * The output array for codewords. In addition, initially this + * array must contain the symbols, sorted primarily by frequency and + * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of + * each entry. + * + * @len + * Output array for codeword lengths. + * + * @len_counts + * An array that provides the number of codewords that will have + * each possible length <= max_codeword_len. + * + * @max_codeword_len + * Maximum length, in bits, of each codeword. + * + * @num_syms + * Number of symbols in the alphabet, including symbols with zero + * frequency. This is the length of the 'A' and 'len' arrays. + */ +static void +gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[], + unsigned max_codeword_len, unsigned num_syms) +{ + u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned i; + unsigned len; + unsigned sym; + + /* + * Given the number of codewords that will have each length, assign + * codeword lengths to symbols. 
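+ * For example, if len_counts[] says that one codeword has length 1 and two
+ * have length 2, the canonical codewords generated below are '0', '10' and
+ * '11' (before bit-reversal).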
We do this by assigning the lengths in + * decreasing order to the symbols sorted primarily by increasing + * frequency and secondarily by increasing symbol value. + */ + for (i = 0, len = max_codeword_len; len >= 1; len--) { + unsigned count = len_counts[len]; + + while (count--) + lens[A[i++] & SYMBOL_MASK] = len; + } + + /* + * Generate the codewords themselves. We initialize the + * 'next_codewords' array to provide the lexicographically first + * codeword of each length, then assign codewords in symbol order. This + * produces a canonical code. + */ + next_codewords[0] = 0; + next_codewords[1] = 0; + for (len = 2; len <= max_codeword_len; len++) + next_codewords[len] = + (next_codewords[len - 1] + len_counts[len - 1]) << 1; + + for (sym = 0; sym < num_syms; sym++) { + /* DEFLATE requires bit-reversed codewords. */ + A[sym] = reverse_codeword(next_codewords[lens[sym]]++, + lens[sym]); + } +} + +/* + * --------------------------------------------------------------------- + * deflate_make_huffman_code() + * --------------------------------------------------------------------- + * + * Given an alphabet and the frequency of each symbol in it, construct a + * length-limited canonical Huffman code. + * + * @num_syms + * The number of symbols in the alphabet. The symbols are the integers in + * the range [0, num_syms - 1]. This parameter must be at least 2 and + * must not exceed (1 << NUM_SYMBOL_BITS). + * + * @max_codeword_len + * The maximum permissible codeword length. + * + * @freqs + * An array of length @num_syms that gives the frequency of each symbol. + * It is valid for some, none, or all of the frequencies to be 0. The sum + * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1. + * + * @lens + * An array of @num_syms entries in which this function will return the + * length, in bits, of the codeword assigned to each symbol. Symbols with + * 0 frequency will not have codewords per se, but their entries in this + * array will be set to 0. No lengths greater than @max_codeword_len will + * be assigned. + * + * @codewords + * An array of @num_syms entries in which this function will return the + * codeword for each symbol, right-justified and padded on the left with + * zeroes. Codewords for symbols with 0 frequency will be undefined. + * + * --------------------------------------------------------------------- + * + * This function builds a length-limited canonical Huffman code. + * + * A length-limited Huffman code contains no codewords longer than some + * specified length, and has exactly (with some algorithms) or approximately + * (with the algorithm used here) the minimum weighted path length from the + * root, given this constraint. + * + * A canonical Huffman code satisfies the properties that a longer codeword + * never lexicographically precedes a shorter codeword, and the lexicographic + * ordering of codewords of the same length is the same as the lexicographic + * ordering of the corresponding symbols. A canonical Huffman code, or more + * generally a canonical prefix code, can be reconstructed from only a list + * containing the codeword length of each symbol. + * + * The classic algorithm to generate a Huffman code creates a node for each + * symbol, then inserts these nodes into a min-heap keyed by symbol frequency. + * Then, repeatedly, the two lowest-frequency nodes are removed from the + * min-heap and added as the children of a new node having frequency equal to + * the sum of its two children, which is then inserted into the min-heap. 
When + * only a single node remains in the min-heap, it is the root of the Huffman + * tree. The codeword for each symbol is determined by the path needed to reach + * the corresponding node from the root. Descending to the left child appends a + * 0 bit, whereas descending to the right child appends a 1 bit. + * + * The classic algorithm is relatively easy to understand, but it is subject to + * a number of inefficiencies. In practice, it is fastest to first sort the + * symbols by frequency. (This itself can be subject to an optimization based + * on the fact that most frequencies tend to be low.) At the same time, we sort + * secondarily by symbol value, which aids the process of generating a canonical + * code. Then, during tree construction, no heap is necessary because both the + * leaf nodes and the unparented non-leaf nodes can be easily maintained in + * sorted order. Consequently, there can never be more than two possibilities + * for the next-lowest-frequency node. + * + * In addition, because we're generating a canonical code, we actually don't + * need the leaf nodes of the tree at all, only the non-leaf nodes. This is + * because for canonical code generation we don't need to know where the symbols + * are in the tree. Rather, we only need to know how many leaf nodes have each + * depth (codeword length). And this information can, in fact, be quickly + * generated from the tree of non-leaves only. + * + * Furthermore, we can build this stripped-down Huffman tree directly in the + * array in which the codewords are to be generated, provided that these array + * slots are large enough to hold a symbol and frequency value. + * + * Still furthermore, we don't even need to maintain explicit child pointers. + * We only need the parent pointers, and even those can be overwritten in-place + * with depth information as part of the process of extracting codeword lengths + * from the tree. So in summary, we do NOT need a big structure like: + * + * struct huffman_tree_node { + * unsigned int symbol; + * unsigned int frequency; + * unsigned int depth; + * struct huffman_tree_node *left_child; + * struct huffman_tree_node *right_child; + * }; + * + * + * ... which often gets used in "naive" implementations of Huffman code + * generation. + * + * Many of these optimizations are based on the implementation in 7-Zip (source + * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov. + */ +static void +deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const u32 freqs[], u8 lens[], u32 codewords[]) +{ + u32 *A = codewords; + unsigned num_used_syms; + + STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); + STATIC_ASSERT(MAX_BLOCK_LENGTH <= ((u32)1 << NUM_FREQ_BITS) - 1); + + /* + * We begin by sorting the symbols primarily by frequency and + * secondarily by symbol value. As an optimization, the array used for + * this purpose ('A') shares storage with the space in which we will + * eventually return the codewords. + */ + num_used_syms = sort_symbols(num_syms, freqs, lens, A); + /* + * 'num_used_syms' is the number of symbols with nonzero frequency. + * This may be less than @num_syms. 'num_used_syms' is also the number + * of entries in 'A' that are valid. Each entry consists of a distinct + * symbol and a nonzero frequency packed into a 32-bit integer. + */ + + /* + * A complete Huffman code must contain at least 2 codewords. Yet, it's + * possible that fewer than 2 symbols were used. 
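+ * (A block that contains only literals and no matches, for instance, uses no
+ * offset symbols at all.)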
When this happens, + * it's usually for the offset code (0-1 symbols used). But it's also + * theoretically possible for the litlen and pre codes (1 symbol used). + * + * The DEFLATE RFC explicitly allows the offset code to contain just 1 + * codeword, or even be completely empty. But it's silent about the + * other codes. It also doesn't say whether, in the 1-codeword case, + * the codeword (which it says must be 1 bit) is '0' or '1'. + * + * In any case, some DEFLATE decompressors reject these cases. zlib + * generally allows them, but it does reject precodes that have just 1 + * codeword. More problematically, zlib v1.2.1 and earlier rejected + * empty offset codes, and this behavior can also be seen in Windows + * Explorer's ZIP unpacker (supposedly even still in Windows 11). + * + * Other DEFLATE compressors, including zlib, always send at least 2 + * codewords in order to make a complete Huffman code. Therefore, this + * is a case where practice does not entirely match the specification. + * We follow practice by generating 2 codewords of length 1: codeword + * '0' for symbol 0, and codeword '1' for another symbol -- the used + * symbol if it exists and is not symbol 0, otherwise symbol 1. This + * does worsen the compression ratio by having to send an unnecessary + * offset codeword length. But this only affects rare cases such as + * blocks containing all literals, and it only makes a tiny difference. + */ + if (unlikely(num_used_syms < 2)) { + unsigned sym = num_used_syms ? (A[0] & SYMBOL_MASK) : 0; + unsigned nonzero_idx = sym ? sym : 1; + + codewords[0] = 0; + lens[0] = 1; + codewords[nonzero_idx] = 1; + lens[nonzero_idx] = 1; + return; + } + + /* + * Build a stripped-down version of the Huffman tree, sharing the array + * 'A' with the symbol values. Then extract length counts from the tree + * and use them to generate the final codewords. + */ + + build_tree(A, num_used_syms); + + { + unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; + + compute_length_counts(A, num_used_syms - 2, + len_counts, max_codeword_len); + + gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); + } +} + +/* + * Clear the Huffman symbol frequency counters. This must be called when + * starting a new DEFLATE block. + */ +static void +deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) +{ + __builtin_memset(&c->freqs, 0, sizeof(c->freqs)); +} + +/* + * Build the literal/length and offset Huffman codes for a DEFLATE block. + * + * This takes as input the frequency tables for each alphabet and produces as + * output a set of tables that map symbols to codewords and codeword lengths. + */ +static void +deflate_make_huffman_codes(const struct deflate_freqs *freqs, + struct deflate_codes *codes) +{ + deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, + MAX_LITLEN_CODEWORD_LEN, + freqs->litlen, + codes->lens.litlen, + codes->codewords.litlen); + + deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, + MAX_OFFSET_CODEWORD_LEN, + freqs->offset, + codes->lens.offset, + codes->codewords.offset); +} + +/* Initialize c->static_codes. 
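+ *
+ * The static (fixed) Huffman code defined by the DEFLATE RFC uses codeword
+ * lengths 8, 9, 7 and 8 bits for litlen symbols 0-143, 144-255, 256-279 and
+ * 280-287 respectively, and 5 bits for every offset symbol. Feeding
+ * frequencies proportional to 2^(-len) into the regular Huffman code builder
+ * reproduces exactly those lengths.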
*/ +static void +deflate_init_static_codes(struct libdeflate_compressor *c) +{ + unsigned i; + + for (i = 0; i < 144; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + for (; i < 256; i++) + c->freqs.litlen[i] = 1 << (9 - 9); + for (; i < 280; i++) + c->freqs.litlen[i] = 1 << (9 - 7); + for (; i < 288; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + + for (i = 0; i < 32; i++) + c->freqs.offset[i] = 1 << (5 - 5); + + deflate_make_huffman_codes(&c->freqs, &c->static_codes); +} + +/* Return the offset slot for the given match offset, using the small map. */ +static unsigned +deflate_get_offset_slot(u32 offset) +{ + /* + * 1 <= offset <= 32768 here. For 1 <= offset <= 256, + * deflate_offset_slot[offset - 1] gives the slot. + * + * For 257 <= offset <= 32768, we take advantage of the fact that 257 is + * the beginning of slot 16, and each slot [16..30) is exactly 1 << 7 == + * 128 times larger than each slot [2..16) (since the number of extra + * bits increases by 1 every 2 slots). Thus, the slot is: + * + * deflate_offset_slot[2 + ((offset - 257) >> 7)] + (16 - 2) + * == deflate_offset_slot[((offset - 1) >> 7)] + 14 + * + * Define 'n = (offset <= 256) ? 0 : 7'. Then any offset is handled by: + * + * deflate_offset_slot[(offset - 1) >> n] + (n << 1) + * + * For better performance, replace 'n = (offset <= 256) ? 0 : 7' with + * the equivalent (for offset <= 536871168) 'n = (256 - offset) >> 29'. + */ + unsigned n = (256 - offset) >> 29; + + return deflate_offset_slot[(offset - 1) >> n] + (n << 1); +} + +static unsigned +deflate_compute_precode_items(const u8 lens[], const unsigned num_lens, + u32 precode_freqs[], unsigned precode_items[]) +{ + unsigned *itemptr; + unsigned run_start; + unsigned run_end; + unsigned extra_bits; + u8 len; + + __builtin_memset(precode_freqs, 0, + DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); + + itemptr = precode_items; + run_start = 0; + do { + /* Find the next run of codeword lengths. */ + + /* len = the length being repeated */ + len = lens[run_start]; + + /* Extend the run. */ + run_end = run_start; + do { + run_end++; + } while (run_end != num_lens && len == lens[run_end]); + + if (len == 0) { + /* Run of zeroes. */ + + /* Symbol 18: RLE 11 to 138 zeroes at a time. */ + while ((run_end - run_start) >= 11) { + extra_bits = MIN((run_end - run_start) - 11, + 0x7F); + precode_freqs[18]++; + *itemptr++ = 18 | (extra_bits << 5); + run_start += 11 + extra_bits; + } + + /* Symbol 17: RLE 3 to 10 zeroes at a time. */ + if ((run_end - run_start) >= 3) { + extra_bits = MIN((run_end - run_start) - 3, + 0x7); + precode_freqs[17]++; + *itemptr++ = 17 | (extra_bits << 5); + run_start += 3 + extra_bits; + } + } else { + + /* A run of nonzero lengths. */ + + /* Symbol 16: RLE 3 to 6 of the previous length. */ + if ((run_end - run_start) >= 4) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + do { + extra_bits = MIN((run_end - run_start) - + 3, 0x3); + precode_freqs[16]++; + *itemptr++ = 16 | (extra_bits << 5); + run_start += 3 + extra_bits; + } while ((run_end - run_start) >= 3); + } + } + + /* Output any remaining lengths without RLE. 
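+ * (This covers zero-runs shorter than 3, nonzero runs shorter than 4, and
+ * the at most 2 lengths that the RLE symbols above can leave behind.)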
*/ + while (run_start != run_end) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + } + } while (run_start != num_lens); + + return itemptr - precode_items; +} + +/* + * Huffman codeword lengths for dynamic Huffman blocks are compressed using a + * separate Huffman code, the "precode", which contains a symbol for each + * possible codeword length in the larger code as well as several special + * symbols to represent repeated codeword lengths (a form of run-length + * encoding). The precode is itself constructed in canonical form, and its + * codeword lengths are represented literally in 19 3-bit fields that + * immediately precede the compressed codeword lengths of the larger code. + */ + +/* Precompute the information needed to output dynamic Huffman codes. */ +static void +deflate_precompute_huffman_header(struct libdeflate_compressor *c) +{ + /* Compute how many litlen and offset symbols are needed. */ + + for (c->o.precode.num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; + c->o.precode.num_litlen_syms > 257; + c->o.precode.num_litlen_syms--) + if (c->codes.lens.litlen[c->o.precode.num_litlen_syms - 1] != 0) + break; + + for (c->o.precode.num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; + c->o.precode.num_offset_syms > 1; + c->o.precode.num_offset_syms--) + if (c->codes.lens.offset[c->o.precode.num_offset_syms - 1] != 0) + break; + + /* + * If we're not using the full set of literal/length codeword lengths, + * then temporarily move the offset codeword lengths over so that the + * literal/length and offset codeword lengths are contiguous. + */ + STATIC_ASSERT(offsetof(struct deflate_lens, offset) == + DEFLATE_NUM_LITLEN_SYMS); + if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { + __builtin_memmove((u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, + (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, + c->o.precode.num_offset_syms); + } + + /* + * Compute the "items" (RLE / literal tokens and extra bits) with which + * the codeword lengths in the larger code will be output. + */ + c->o.precode.num_items = + deflate_compute_precode_items((u8 *)&c->codes.lens, + c->o.precode.num_litlen_syms + + c->o.precode.num_offset_syms, + c->o.precode.freqs, + c->o.precode.items); + + /* Build the precode. */ + deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, + MAX_PRE_CODEWORD_LEN, + c->o.precode.freqs, c->o.precode.lens, + c->o.precode.codewords); + + /* Count how many precode lengths we actually need to output. */ + for (c->o.precode.num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; + c->o.precode.num_explicit_lens > 4; + c->o.precode.num_explicit_lens--) + if (c->o.precode.lens[deflate_precode_lens_permutation[ + c->o.precode.num_explicit_lens - 1]] != 0) + break; + + /* Restore the offset codeword lengths if needed. */ + if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { + __builtin_memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, + (u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, + c->o.precode.num_offset_syms); + } +} + +/* + * To make it faster to output matches, compute the "full" match length + * codewords, i.e. the concatenation of the litlen codeword and the extra bits + * for each possible match length. 
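+ *
+ * For example, a match of length 12 is coded as litlen symbol 265 followed
+ * by one extra bit with value 1; the precomputed entry packs both into a
+ * single value, so the whole length can be written with one ADD_BITS() call.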
+ */ +static void +deflate_compute_full_len_codewords(struct libdeflate_compressor *c, + const struct deflate_codes *codes) +{ + unsigned len; + + STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_LENGTH_BITS <= 32); + + for (len = DEFLATE_MIN_MATCH_LEN; len <= DEFLATE_MAX_MATCH_LEN; len++) { + unsigned slot = deflate_length_slot[len]; + unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + slot; + u32 extra_bits = len - deflate_length_slot_base[slot]; + + c->o.length.codewords[len] = + codes->codewords.litlen[litlen_sym] | + (extra_bits << codes->lens.litlen[litlen_sym]); + c->o.length.lens[len] = codes->lens.litlen[litlen_sym] + + deflate_extra_length_bits[slot]; + } +} + +/* Write a match to the output buffer. */ +#define WRITE_MATCH(c_, codes_, length_, offset_, offset_slot_) \ +do { \ + const struct libdeflate_compressor *c__ = (c_); \ + const struct deflate_codes *codes__ = (codes_); \ + unsigned length__ = (length_); \ + unsigned offset__ = (offset_); \ + unsigned offset_slot__ = (offset_slot_); \ + \ + /* Litlen symbol and extra length bits */ \ + STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS)); \ + ADD_BITS(c__->o.length.codewords[length__], \ + c__->o.length.lens[length__]); \ + \ + if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS + \ + MAX_OFFSET_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ + FLUSH_BITS(); \ + \ + /* Offset symbol */ \ + ADD_BITS(codes__->codewords.offset[offset_slot__], \ + codes__->lens.offset[offset_slot__]); \ + \ + if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ + FLUSH_BITS(); \ + \ + /* Extra offset bits */ \ + ADD_BITS(offset__ - deflate_offset_slot_base[offset_slot__], \ + deflate_extra_offset_bits[offset_slot__]); \ + \ + FLUSH_BITS(); \ +} while (0) + +/* + * Choose the best type of block to use (dynamic Huffman, static Huffman, or + * uncompressed), then output it. + * + * The uncompressed data of the block is @block_begin[0..@block_length-1]. The + * sequence of literals and matches that will be used to compress the block (if + * a compressed block is chosen) is given by @sequences if it's non-NULL, or + * else @c->p.n.optimum_nodes. @c->freqs and @c->codes must be already set + * according to the literals, matches, and end-of-block symbol. + */ +static void +deflate_flush_block(struct libdeflate_compressor *c, + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct deflate_sequence *sequences, + bool is_final_block) +{ + /* + * It is hard to get compilers to understand that writes to 'os->next' + * don't alias 'os'. That hurts performance significantly, as + * everything in 'os' would keep getting re-loaded. ('restrict' + * *should* do the trick, but it's unreliable.) Therefore, we keep all + * the output bitstream state in local variables, and output bits using + * macros. This is similar to what the decompressor does. + */ + const u8 *in_next = block_begin; + const u8 * const in_end = block_begin + block_length; + bitbuf_t bitbuf = os->bitbuf; + unsigned bitcount = os->bitcount; + u8 *out_next = os->next; + u8 * const out_fast_end = + os->end - MIN(WORDBYTES - 1, os->end - out_next); + /* + * The cost for each block type, in bits. Start with the cost of the + * block header which is 3 bits. 
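+ * (1 bit for BFINAL plus 2 bits for BTYPE.)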
+ */ + u32 dynamic_cost = 3; + u32 static_cost = 3; + u32 uncompressed_cost = 3; + u32 best_cost; + struct deflate_codes *codes; + unsigned sym; + + ASSERT(block_length >= MIN_BLOCK_LENGTH || + (is_final_block && block_length > 0)); + ASSERT(block_length <= MAX_BLOCK_LENGTH); + ASSERT(bitcount <= 7); + ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); + ASSERT(out_next <= os->end); + ASSERT(!os->overflow); + + /* Precompute the precode items and build the precode. */ + deflate_precompute_huffman_header(c); + + /* Account for the cost of encoding dynamic Huffman codes. */ + dynamic_cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); + for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { + u32 extra = deflate_extra_precode_bits[sym]; + + dynamic_cost += c->o.precode.freqs[sym] * + (extra + c->o.precode.lens[sym]); + } + + /* Account for the cost of encoding literals. */ + for (sym = 0; sym < 144; sym++) { + dynamic_cost += c->freqs.litlen[sym] * + c->codes.lens.litlen[sym]; + static_cost += c->freqs.litlen[sym] * 8; + } + for (; sym < 256; sym++) { + dynamic_cost += c->freqs.litlen[sym] * + c->codes.lens.litlen[sym]; + static_cost += c->freqs.litlen[sym] * 9; + } + + /* Account for the cost of encoding the end-of-block symbol. */ + dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK]; + static_cost += 7; + + /* Account for the cost of encoding lengths. */ + for (sym = DEFLATE_FIRST_LEN_SYM; + sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits); + sym++) { + u32 extra = deflate_extra_length_bits[ + sym - DEFLATE_FIRST_LEN_SYM]; + + dynamic_cost += c->freqs.litlen[sym] * + (extra + c->codes.lens.litlen[sym]); + static_cost += c->freqs.litlen[sym] * + (extra + c->static_codes.lens.litlen[sym]); + } + + /* Account for the cost of encoding offsets. */ + for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { + u32 extra = deflate_extra_offset_bits[sym]; + + dynamic_cost += c->freqs.offset[sym] * + (extra + c->codes.lens.offset[sym]); + static_cost += c->freqs.offset[sym] * (extra + 5); + } + + /* Compute the cost of using uncompressed blocks. */ + uncompressed_cost += (-(bitcount + 3) & 7) + 32 + + (40 * (DIV_ROUND_UP(block_length, + UINT16_MAX) - 1)) + + (8 * block_length); + + /* + * Choose and output the cheapest type of block. If there is a tie, + * prefer uncompressed, then static, then dynamic. + */ + + best_cost = MIN(dynamic_cost, MIN(static_cost, uncompressed_cost)); + + /* If the block isn't going to fit, then stop early. */ + if (DIV_ROUND_UP(bitcount + best_cost, 8) > os->end - out_next) { + os->overflow = true; + return; + } + /* + * Else, now we know that the block fits, so no further bounds checks on + * the output buffer are required until the next block. + */ + + if (best_cost == uncompressed_cost) { + /* + * Uncompressed block(s). DEFLATE limits the length of + * uncompressed blocks to UINT16_MAX bytes, so if the length of + * the "block" we're flushing is over UINT16_MAX, we actually + * output multiple blocks. + */ + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; + + if (in_end - in_next <= UINT16_MAX) { + bfinal = is_final_block; + len = in_end - in_next; + } + /* It was already checked that there is enough space. */ + ASSERT(os->end - out_next >= + DIV_ROUND_UP(bitcount + 3, 8) + 4 + len); + /* + * Output BFINAL (1 bit) and BTYPE (2 bits), then align + * to a byte boundary. 
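+ * Since BTYPE is 0 for uncompressed blocks, only the BFINAL bit needs to
+ * be merged into the pending bits; when bitcount > 5 the 3 header bits no
+ * longer fit in the first byte, so a second, all-zero byte is emitted.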
+ */ + STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); + *out_next++ = (bfinal << bitcount) | bitbuf; + if (bitcount > 5) + *out_next++ = 0; + bitbuf = 0; + bitcount = 0; + /* Output LEN and NLEN, then the data itself. */ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + __builtin_memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + /* Done outputting uncompressed block(s) */ + goto out; + } + + if (best_cost == static_cost) { + /* Static Huffman block */ + codes = &c->static_codes; + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); + FLUSH_BITS(); + } else { + const unsigned num_explicit_lens = c->o.precode.num_explicit_lens; + const unsigned num_precode_items = c->o.precode.num_items; + unsigned precode_sym, precode_item; + unsigned i; + + /* Dynamic Huffman block */ + + codes = &c->codes; + STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, 2); + ADD_BITS(c->o.precode.num_litlen_syms - 257, 5); + ADD_BITS(c->o.precode.num_offset_syms - 1, 5); + ADD_BITS(num_explicit_lens - 4, 4); + + /* Output the lengths of the codewords in the precode. */ + if (CAN_BUFFER(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { + /* + * A 64-bit bitbuffer is just one bit too small to hold + * the maximum number of precode lens, so to minimize + * flushes we merge one len with the previous fields. + */ + precode_sym = deflate_precode_lens_permutation[0]; + ADD_BITS(c->o.precode.lens[precode_sym], 3); + FLUSH_BITS(); + i = 1; /* num_explicit_lens >= 4 */ + do { + precode_sym = + deflate_precode_lens_permutation[i]; + ADD_BITS(c->o.precode.lens[precode_sym], 3); + } while (++i < num_explicit_lens); + FLUSH_BITS(); + } else { + FLUSH_BITS(); + i = 0; + do { + precode_sym = + deflate_precode_lens_permutation[i]; + ADD_BITS(c->o.precode.lens[precode_sym], 3); + FLUSH_BITS(); + } while (++i < num_explicit_lens); + } + + /* + * Output the lengths of the codewords in the litlen and offset + * codes, encoded by the precode. + */ + i = 0; + do { + precode_item = c->o.precode.items[i]; + precode_sym = precode_item & 0x1F; + STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7)); + ADD_BITS(c->o.precode.codewords[precode_sym], + c->o.precode.lens[precode_sym]); + ADD_BITS(precode_item >> 5, + deflate_extra_precode_bits[precode_sym]); + FLUSH_BITS(); + } while (++i < num_precode_items); + } + + /* Output the literals and matches for a dynamic or static block. */ + ASSERT(bitcount <= 7); + deflate_compute_full_len_codewords(c, codes); +#if SUPPORT_NEAR_OPTIMAL_PARSING + if (sequences == NULL) { + /* Output the literals and matches from the minimum-cost path */ + struct deflate_optimum_node *cur_node = + &c->p.n.optimum_nodes[0]; + struct deflate_optimum_node * const end_node = + &c->p.n.optimum_nodes[block_length]; + do { + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_node->item >> + OPTIMUM_OFFSET_SHIFT; + if (length == 1) { + /* Literal */ + ADD_BITS(codes->codewords.litlen[offset], + codes->lens.litlen[offset]); + FLUSH_BITS(); + } else { + /* Match */ + WRITE_MATCH(c, codes, length, offset, + c->p.n.offset_slot_full[offset]); + } + cur_node += length; + } while (cur_node != end_node); + } else +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + { + /* Output the literals and matches from the sequences list. 
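+ * (Each sequence is a run of literals followed by one match; the final
+ * sequence has length 0 and only flushes the trailing run of literals.)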
*/ + const struct deflate_sequence *seq; + + for (seq = sequences; ; seq++) { + u32 litrunlen = seq->litrunlen_and_length & + SEQ_LITRUNLEN_MASK; + unsigned length = seq->litrunlen_and_length >> + SEQ_LENGTH_SHIFT; + unsigned lit; + + /* Output a run of literals. */ + if (CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) { + for (; litrunlen >= 4; litrunlen -= 4) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + FLUSH_BITS(); + } + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + } + } + FLUSH_BITS(); + } + } else { + while (litrunlen--) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + FLUSH_BITS(); + } + } + + if (length == 0) { /* Last sequence? */ + ASSERT(in_next == in_end); + break; + } + + /* Output a match. */ + WRITE_MATCH(c, codes, length, seq->offset, + seq->offset_slot); + in_next += length; + } + } + + /* Output the end-of-block symbol. */ + ASSERT(bitcount <= 7); + ADD_BITS(codes->codewords.litlen[DEFLATE_END_OF_BLOCK], + codes->lens.litlen[DEFLATE_END_OF_BLOCK]); + FLUSH_BITS(); +out: + ASSERT(bitcount <= 7); + /* + * Assert that the block cost was computed correctly. This is relied on + * above for the bounds check on the output buffer. Also, + * libdeflate_deflate_compress_bound() relies on this via the assumption + * that uncompressed blocks will always be used when cheapest. + */ + ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == best_cost); + os->bitbuf = bitbuf; + os->bitcount = bitcount; + os->next = out_next; +} + +static void +deflate_finish_block(struct libdeflate_compressor *c, + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct deflate_sequence *sequences, + bool is_final_block) +{ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_flush_block(c, os, block_begin, block_length, sequences, + is_final_block); +} + +/******************************************************************************/ + +/* + * Block splitting algorithm. The problem is to decide when it is worthwhile to + * start a new block with new Huffman codes. There is a theoretically optimal + * solution: recursively consider every possible block split, considering the + * exact cost of each block, and choose the minimum cost approach. But this is + * far too slow. Instead, as an approximation, we can count symbols and after + * every N symbols, compare the expected distribution of symbols based on the + * previous data with the actual distribution. If they differ "by enough", then + * start a new block. + * + * As an optimization and heuristic, we don't distinguish between every symbol + * but rather we combine many symbols into a single "observation type". For + * literals we only look at the high bits and low bits, and for matches we only + * look at whether the match is long or not. 
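+ * (A match counts as "long" when its length is at least 9; see
+ * observe_match() below.)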
The assumption is that for typical + * "real" data, places that are good block boundaries will tend to be noticeable + * based only on changes in these aggregate probabilities, without looking for + * subtle differences in individual symbols. For example, a change from ASCII + * bytes to non-ASCII bytes, or from few matches (generally less compressible) + * to many matches (generally more compressible), would be easily noticed based + * on the aggregates. + * + * For determining whether the probability distributions are "different enough" + * to start a new block, the simple heuristic of splitting when the sum of + * absolute differences exceeds a constant seems to be good enough. We also add + * a number proportional to the block length so that the algorithm is more + * likely to end long blocks than short blocks. This reflects the general + * expectation that it will become increasingly beneficial to start a new block + * as the current block grows longer. + * + * Finally, for an approximation, it is not strictly necessary that the exact + * symbols being used are considered. With "near-optimal parsing", for example, + * the actual symbols that will be used are unknown until after the block + * boundary is chosen and the block has been optimized. Since the final choices + * cannot be used, we can use preliminary "greedy" choices instead. + */ + +/* Initialize the block split statistics when starting a new block. */ +static void +init_block_split_stats(struct block_split_stats *stats) +{ + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + stats->new_observations[i] = 0; + stats->observations[i] = 0; + } + stats->num_new_observations = 0; + stats->num_observations = 0; +} + +/* + * Literal observation. Heuristic: use the top 2 bits and low 1 bits of the + * literal, for 8 possible literal observation types. + */ +static void +observe_literal(struct block_split_stats *stats, u8 lit) +{ + stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++; + stats->num_new_observations++; +} + +/* + * Match observation. Heuristic: use one observation type for "short match" and + * one observation type for "long match". + */ +static void +observe_match(struct block_split_stats *stats, unsigned length) +{ + stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + + (length >= 9)]++; + stats->num_new_observations++; +} + +static void +merge_new_observations(struct block_split_stats *stats) +{ + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + stats->observations[i] += stats->new_observations[i]; + stats->new_observations[i] = 0; + } + stats->num_observations += stats->num_new_observations; + stats->num_new_observations = 0; +} + +static bool +do_end_block_check(struct block_split_stats *stats, u32 block_length) +{ + if (stats->num_observations > 0) { + /* + * Compute the sum of absolute differences of probabilities. To + * avoid needing to use floating point arithmetic or do slow + * divisions, we do all arithmetic with the probabilities + * multiplied by num_observations * num_new_observations. E.g., + * for the "old" observations the probabilities would be + * (double)observations[i] / num_observations, but since we + * multiply by both num_observations and num_new_observations we + * really do observations[i] * num_new_observations. 
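+ * Symmetrically, the "new" probabilities become
+ * new_observations[i] * num_observations, so both sides of the comparison
+ * end up on the same scale.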
+ */ + u32 total_delta = 0; + u32 num_items; + u32 cutoff; + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + u32 expected = stats->observations[i] * + stats->num_new_observations; + u32 actual = stats->new_observations[i] * + stats->num_observations; + u32 delta = (actual > expected) ? actual - expected : + expected - actual; + + total_delta += delta; + } + + num_items = stats->num_observations + + stats->num_new_observations; + /* + * Heuristic: the cutoff is when the sum of absolute differences + * of probabilities becomes at least 200/512. As above, the + * probability is multiplied by both num_new_observations and + * num_observations. Be careful to avoid integer overflow. + */ + cutoff = stats->num_new_observations * 200 / 512 * + stats->num_observations; + /* + * Very short blocks have a lot of overhead for the Huffman + * codes, so only use them if it clearly seems worthwhile. + * (This is an additional penalty, which adds to the smaller + * penalty below which scales more slowly.) + */ + if (block_length < 10000 && num_items < 8192) + cutoff += (u64)cutoff * (8192 - num_items) / 8192; + + /* Ready to end the block? */ + if (total_delta + + (block_length / 4096) * stats->num_observations >= cutoff) + return true; + } + merge_new_observations(stats); + return false; +} + +static bool +ready_to_check_block(const struct block_split_stats *stats, + const u8 *in_block_begin, const u8 *in_next, + const u8 *in_end) +{ + return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK + && in_next - in_block_begin >= MIN_BLOCK_LENGTH + && in_end - in_next >= MIN_BLOCK_LENGTH; +} + +static bool +should_end_block(struct block_split_stats *stats, + const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) +{ + /* Ready to try to end the block (again)? */ + if (!ready_to_check_block(stats, in_block_begin, in_next, in_end)) + return false; + + return do_end_block_check(stats, in_next - in_block_begin); +} + +/******************************************************************************/ + +static void +deflate_begin_sequences(struct libdeflate_compressor *c, + struct deflate_sequence *first_seq) +{ + deflate_reset_symbol_frequencies(c); + first_seq->litrunlen_and_length = 0; +} + +static void +deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal, + bool gather_split_stats, struct deflate_sequence *seq) +{ + c->freqs.litlen[literal]++; + + if (gather_split_stats) + observe_literal(&c->split_stats, literal); + + STATIC_ASSERT(MAX_BLOCK_LENGTH <= SEQ_LITRUNLEN_MASK); + seq->litrunlen_and_length++; +} + +static void +deflate_choose_match(struct libdeflate_compressor *c, + unsigned length, unsigned offset, bool gather_split_stats, + struct deflate_sequence **seq_p) +{ + struct deflate_sequence *seq = *seq_p; + unsigned length_slot = deflate_length_slot[length]; + unsigned offset_slot = deflate_get_offset_slot(offset); + + c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++; + c->freqs.offset[offset_slot]++; + if (gather_split_stats) + observe_match(&c->split_stats, length); + + seq->litrunlen_and_length |= (u32)length << SEQ_LENGTH_SHIFT; + seq->offset = offset; + seq->offset_slot = offset_slot; + + seq++; + seq->litrunlen_and_length = 0; + *seq_p = seq; +} + +/* + * Decrease the maximum and nice match lengths if we're approaching the end of + * the input buffer. 
+ */ +static void +adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining) +{ + if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { + *max_len = remaining; + *nice_len = MIN(*nice_len, *max_len); + } +} + +/* + * Choose the minimum match length for the greedy and lazy parsers. + * + * By default the minimum match length is 3, which is the smallest length the + * DEFLATE format allows. However, with greedy and lazy parsing, some data + * (e.g. DNA sequencing data) benefits greatly from a longer minimum length. + * Typically, this is because literals are very cheap. In general, the + * near-optimal parser handles this case naturally, but the greedy and lazy + * parsers need a heuristic to decide when to use short matches. + * + * The heuristic we use is to make the minimum match length depend on the number + * of different literals that exist in the data. If there are many different + * literals, then literals will probably be expensive, so short matches will + * probably be worthwhile. Conversely, if not many literals are used, then + * probably literals will be cheap and short matches won't be worthwhile. + */ +static unsigned +choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth) +{ + /* map from num_used_literals to min_len */ + static const u8 min_lens[] = { + 9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* The rest is implicitly 3. */ + }; + unsigned min_len; + + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3); + STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1); + + if (num_used_literals >= ARRAY_LEN(min_lens)) + return 3; + min_len = min_lens[num_used_literals]; + /* + * With a low max_search_depth, it may be too hard to find long matches. + */ + if (max_search_depth < 16) { + if (max_search_depth < 5) + min_len = MIN(min_len, 4); + else if (max_search_depth < 10) + min_len = MIN(min_len, 5); + else + min_len = MIN(min_len, 7); + } + return min_len; +} + +static unsigned +calculate_min_match_len(const u8 *data, size_t data_len, + unsigned max_search_depth) +{ + u8 used[256] = { 0 }; + unsigned num_used_literals = 0; + size_t i; + + /* + * For very short inputs, the static Huffman code has a good chance of + * being best, in which case there is no reason to avoid short matches. + */ + if (data_len < 512) + return DEFLATE_MIN_MATCH_LEN; + + /* + * For an initial approximation, scan the first 4 KiB of data. The + * caller may use recalculate_min_match_len() to update min_len later. + */ + data_len = MIN(data_len, 4096); + for (i = 0; i < data_len; i++) + used[data[i]] = 1; + for (i = 0; i < 256; i++) + num_used_literals += used[i]; + return choose_min_match_len(num_used_literals, max_search_depth); +} + +/* + * Recalculate the minimum match length for a block, now that we know the + * distribution of literals that are actually being used (freqs->litlen). + */ +static unsigned +recalculate_min_match_len(const struct deflate_freqs *freqs, + unsigned max_search_depth) +{ + u32 literal_freq = 0; + u32 cutoff; + unsigned num_used_literals = 0; + int i; + + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + literal_freq += freqs->litlen[i]; + + cutoff = literal_freq >> 10; /* Ignore literals used very rarely. 
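+ * (A literal must account for more than roughly 1/1024 of all
+ * literal occurrences to be counted as "used" below.)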
*/ + + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + if (freqs->litlen[i] > cutoff) + num_used_literals++; + } + return choose_min_match_len(num_used_literals, max_search_depth); +} + +static const u8 * +choose_max_block_end(const u8 *in_block_begin, const u8 *in_end, + size_t soft_max_len) +{ + if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH) + return in_end; + return in_block_begin + soft_max_len; +} + +/* + * This is the level 0 "compressor". It always outputs uncompressed blocks. + */ +static size_t +deflate_compress_none(const u8 *in, size_t in_nbytes, + u8 *out, size_t out_nbytes_avail) +{ + const u8 *in_next = in; + const u8 * const in_end = in + in_nbytes; + u8 *out_next = out; + u8 * const out_end = out + out_nbytes_avail; + + /* + * If the input is zero-length, we still must output a block in order + * for the output to be a valid DEFLATE stream. Handle this case + * specially to avoid potentially passing NULL to memcpy() below. + */ + if (unlikely(in_nbytes == 0)) { + if (out_nbytes_avail < 5) + return 0; + /* BFINAL and BTYPE */ + *out_next++ = 1 | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); + /* LEN and NLEN */ + put_unaligned_le32(0xFFFF0000, out_next); + return 5; + } + + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; + + if (in_end - in_next <= UINT16_MAX) { + bfinal = 1; + len = in_end - in_next; + } + if (out_end - out_next < 5 + len) + return 0; + /* + * Output BFINAL and BTYPE. The stream is already byte-aligned + * here, so this step always requires outputting exactly 1 byte. + */ + *out_next++ = bfinal | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); + + /* Output LEN and NLEN, then the data itself. */ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + __builtin_memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + + return out_next - out; +} + +/* + * This is a faster variant of deflate_compress_greedy(). It uses the + * ht_matchfinder rather than the hc_matchfinder. It also skips the block + * splitting algorithm and just uses fixed length blocks. c->max_search_depth + * has no effect with this algorithm, as it is hardcoded in ht_matchfinder.h. 
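+ * Since blocks are never split adaptively on this path, the calls below pass
+ * 'false' for gather_split_stats and no block-splitting statistics are
+ * collected.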
+ */ +static void +deflate_compress_fastest(struct libdeflate_compressor * restrict c, + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hash = 0; + + ht_matchfinder_init(&c->p.f.ht_mf); + + do { + /* Starting a new DEFLATE block */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = choose_max_block_end( + in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH); + struct deflate_sequence *seq = c->p.f.sequences; + + deflate_begin_sequences(c, seq); + + do { + u32 length; + u32 offset; + size_t remaining = in_end - in_next; + + if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { + max_len = remaining; + if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) { + do { + deflate_choose_literal(c, + *in_next++, false, seq); + } while (--max_len); + break; + } + nice_len = MIN(nice_len, max_len); + } + length = ht_matchfinder_longest_match(&c->p.f.ht_mf, + &in_cur_base, + in_next, + max_len, + nice_len, + &next_hash, + &offset); + if (length) { + /* Match found */ + deflate_choose_match(c, length, offset, false, + &seq); + ht_matchfinder_skip_bytes(&c->p.f.ht_mf, + &in_cur_base, + in_next + 1, + in_end, + length - 1, + &next_hash); + in_next += length; + } else { + /* No match found */ + deflate_choose_literal(c, *in_next++, false, + seq); + } + + /* Check if it's time to output another block. */ + } while (in_next < in_max_block_end && + seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]); + + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.f.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); +} + +/* + * This is the "greedy" DEFLATE compressor. It always chooses the longest match. + */ +static void +deflate_compress_greedy(struct libdeflate_compressor * restrict c, + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hashes[2] = {0, 0}; + + hc_matchfinder_init(&c->p.g.hc_mf); + + do { + /* Starting a new DEFLATE block */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = choose_max_block_end( + in_next, in_end, SOFT_MAX_BLOCK_LENGTH); + struct deflate_sequence *seq = c->p.g.sequences; + unsigned min_len; + + init_block_split_stats(&c->split_stats); + deflate_begin_sequences(c, seq); + min_len = calculate_min_match_len(in_next, + in_max_block_end - in_next, + c->max_search_depth); + do { + u32 length; + u32 offset; + + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + length = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next, + min_len - 1, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + &offset); + + if (length >= min_len && + (length > DEFLATE_MIN_MATCH_LEN || + offset <= 4096)) { + /* Match found */ + deflate_choose_match(c, length, offset, true, + &seq); + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next + 1, + in_end, + length - 1, + next_hashes); + in_next += length; + } else { + /* No match found */ + deflate_choose_literal(c, *in_next++, true, + seq); + } + + /* Check if it's time to output another block. 
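+ * (The block ends when the soft maximum block length is reached, the
+ * sequence buffer fills up, or the block-splitting heuristic fires.)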
*/ + } while (in_next < in_max_block_end && + seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && + !should_end_block(&c->split_stats, + in_block_begin, in_next, in_end)); + + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.g.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); +} + +static void +deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c, + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os, bool lazy2) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hashes[2] = {0, 0}; + + hc_matchfinder_init(&c->p.g.hc_mf); + + do { + /* Starting a new DEFLATE block */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = choose_max_block_end( + in_next, in_end, SOFT_MAX_BLOCK_LENGTH); + const u8 *next_recalc_min_len = + in_next + MIN(in_end - in_next, 10000); + struct deflate_sequence *seq = c->p.g.sequences; + unsigned min_len; + + init_block_split_stats(&c->split_stats); + deflate_begin_sequences(c, seq); + min_len = calculate_min_match_len(in_next, + in_max_block_end - in_next, + c->max_search_depth); + do { + unsigned cur_len; + unsigned cur_offset; + unsigned next_len; + unsigned next_offset; + + /* + * Recalculate the minimum match length if it hasn't + * been done recently. + */ + if (in_next >= next_recalc_min_len) { + min_len = recalculate_min_match_len( + &c->freqs, + c->max_search_depth); + next_recalc_min_len += + MIN(in_end - next_recalc_min_len, + in_next - in_block_begin); + } + + /* Find the longest match at the current position. */ + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + cur_len = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next, + min_len - 1, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + &cur_offset); + if (cur_len < min_len || + (cur_len == DEFLATE_MIN_MATCH_LEN && + cur_offset > 8192)) { + /* No match found. Choose a literal. */ + deflate_choose_literal(c, *in_next++, true, + seq); + continue; + } + in_next++; + +have_cur_match: + /* + * We have a match at the current position. + * If it's very long, choose it immediately. + */ + if (cur_len >= nice_len) { + deflate_choose_match(c, cur_len, cur_offset, + true, &seq); + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 1, + next_hashes); + in_next += cur_len - 1; + continue; + } + + /* + * Try to find a better match at the next position. + * + * Note: since we already have a match at the *current* + * position, we use only half the 'max_search_depth' + * when checking the *next* position. This is a useful + * trade-off because it's more worthwhile to use a + * greater search depth on the initial match. + * + * Note: it's possible to structure the code such that + * there's only one call to longest_match(), which + * handles both the "find the initial match" and "try to + * find a better match" cases. However, it is faster to + * have two call sites, with longest_match() inlined at + * each. 
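+ * The comparison below prefers the next match only if it is
+ * sufficiently longer and/or has a sufficiently smaller offset than
+ * the current one; bsr32() of an offset is used as a cheap proxy for
+ * how many bits that offset costs to encode.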
+ */ + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + next_len = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next++, + cur_len - 1, + max_len, + nice_len, + c->max_search_depth >> 1, + next_hashes, + &next_offset); + if (next_len >= cur_len && + 4 * (int)(next_len - cur_len) + + ((int)bsr32(cur_offset) - + (int)bsr32(next_offset)) > 2) { + /* + * Found a better match at the next position. + * Output a literal. Then the next match + * becomes the current match. + */ + deflate_choose_literal(c, *(in_next - 2), true, + seq); + cur_len = next_len; + cur_offset = next_offset; + goto have_cur_match; + } + + if (lazy2) { + /* In lazy2 mode, look ahead another position */ + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + next_len = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next++, + cur_len - 1, + max_len, + nice_len, + c->max_search_depth >> 2, + next_hashes, + &next_offset); + if (next_len >= cur_len && + 4 * (int)(next_len - cur_len) + + ((int)bsr32(cur_offset) - + (int)bsr32(next_offset)) > 6) { + /* + * There's a much better match two + * positions ahead, so use two literals. + */ + deflate_choose_literal( + c, *(in_next - 3), true, seq); + deflate_choose_literal( + c, *(in_next - 2), true, seq); + cur_len = next_len; + cur_offset = next_offset; + goto have_cur_match; + } + /* + * No better match at either of the next 2 + * positions. Output the current match. + */ + deflate_choose_match(c, cur_len, cur_offset, + true, &seq); + if (cur_len > 3) { + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 3, + next_hashes); + in_next += cur_len - 3; + } + } else { /* !lazy2 */ + /* + * No better match at the next position. Output + * the current match. + */ + deflate_choose_match(c, cur_len, cur_offset, + true, &seq); + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 2, + next_hashes); + in_next += cur_len - 2; + } + /* Check if it's time to output another block. */ + } while (in_next < in_max_block_end && + seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && + !should_end_block(&c->split_stats, + in_block_begin, in_next, in_end)); + + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.g.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); +} + +/* + * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to + * see if there's a better match at the next position. If yes, it outputs a + * literal and continues to the next position. If no, it outputs the match. + */ +static void +deflate_compress_lazy(struct libdeflate_compressor * restrict c, + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) +{ + deflate_compress_lazy_generic(c, in, in_nbytes, os, false); +} + +/* + * The lazy2 compressor. This is similar to the regular lazy one, but it looks + * for a better match at the next 2 positions rather than the next 1. This + * makes it take slightly more time, but compress some inputs slightly more. 
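+ *
+ * Note that the match found two positions ahead must clear a stricter
+ * threshold (> 6 instead of > 2) before it is preferred, since taking it
+ * costs two literals instead of one.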
+ */ +static void +deflate_compress_lazy2(struct libdeflate_compressor * restrict c, + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) +{ + deflate_compress_lazy_generic(c, in, in_nbytes, os, true); +} + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* + * Follow the minimum-cost path in the graph of possible match/literal choices + * for the current block and compute the frequencies of the Huffman symbols that + * would be needed to output those matches and literals. + */ +static void +deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) +{ + struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; + struct deflate_optimum_node *end_node = + &c->p.n.optimum_nodes[block_length]; + + do { + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; + + if (length == 1) { + /* Literal */ + c->freqs.litlen[offset]++; + } else { + /* Match */ + c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + + deflate_length_slot[length]]++; + c->freqs.offset[c->p.n.offset_slot_full[offset]]++; + } + cur_node += length; + } while (cur_node != end_node); + + /* Tally the end-of-block symbol. */ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; +} + +static void +deflate_choose_all_literals(struct libdeflate_compressor *c, + const u8 *block, u32 block_length) +{ + u32 i; + + deflate_reset_symbol_frequencies(c); + for (i = 0; i < block_length; i++) + c->freqs.litlen[block[i]]++; + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + + deflate_make_huffman_codes(&c->freqs, &c->codes); +} + +/* + * Compute the exact cost, in bits, that would be required to output the matches + * and literals described by @c->freqs as a dynamic Huffman block. The litlen + * and offset codes are assumed to have already been built in @c->codes. + */ +static u32 +deflate_compute_true_cost(struct libdeflate_compressor *c) +{ + u32 cost = 0; + unsigned sym; + + deflate_precompute_huffman_header(c); + + __builtin_memset(&c->codes.lens.litlen[c->o.precode.num_litlen_syms], 0, + DEFLATE_NUM_LITLEN_SYMS - c->o.precode.num_litlen_syms); + + cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); + for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { + cost += c->o.precode.freqs[sym] * + (c->o.precode.lens[sym] + + deflate_extra_precode_bits[sym]); + } + + for (sym = 0; sym < DEFLATE_FIRST_LEN_SYM; sym++) + cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; + + for (; sym < DEFLATE_FIRST_LEN_SYM + + ARRAY_LEN(deflate_extra_length_bits); sym++) + cost += c->freqs.litlen[sym] * + (c->codes.lens.litlen[sym] + + deflate_extra_length_bits[sym - DEFLATE_FIRST_LEN_SYM]); + + for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) + cost += c->freqs.offset[sym] * + (c->codes.lens.offset[sym] + + deflate_extra_offset_bits[sym]); + return cost; +} + +/* Set the current cost model from the codeword lengths specified in @lens. */ +static void +deflate_set_costs_from_codes(struct libdeflate_compressor *c, + const struct deflate_lens *lens) +{ + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + u32 bits = (lens->litlen[i] ? + lens->litlen[i] : LITERAL_NOSTAT_BITS); + + c->p.n.costs.literal[i] = bits * BIT_COST; + } + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { + unsigned length_slot = deflate_length_slot[i]; + unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot; + u32 bits = (lens->litlen[litlen_sym] ? 
+ lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); + + bits += deflate_extra_length_bits[length_slot]; + c->p.n.costs.length[i] = bits * BIT_COST; + } + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { + u32 bits = (lens->offset[i] ? + lens->offset[i] : OFFSET_NOSTAT_BITS); + + bits += deflate_extra_offset_bits[i]; + c->p.n.costs.offset_slot[i] = bits * BIT_COST; + } +} + +/* + * This lookup table gives the default cost of a literal symbol and of a length + * symbol, depending on the characteristics of the input data. It was generated + * by scripts/gen_default_litlen_costs.py. + * + * This table is indexed first by the estimated match probability: + * + * i=0: data doesn't contain many matches [match_prob=0.25] + * i=1: neutral [match_prob=0.50] + * i=2: data contains lots of matches [match_prob=0.75] + * + * This lookup produces a subtable which maps the number of distinct used + * literals to the default cost of a literal symbol, i.e.: + * + * int(-log2((1 - match_prob) / num_used_literals) * BIT_COST) + * + * ... for num_used_literals in [1, 256] (and 0, which is copied from 1). This + * accounts for literals usually getting cheaper as the number of distinct + * literals decreases, and as the proportion of literals to matches increases. + * + * The lookup also produces the cost of a length symbol, which is: + * + * int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST) + * + * Note: we don't currently assign different costs to different literal symbols, + * or to different length symbols, as this is hard to do in a useful way. + */ +static const struct { + u8 used_lits_to_lit_cost[257]; + u8 len_sym_cost; +} default_litlen_costs[] = { + { /* match_prob = 0.25 */ + .used_lits_to_lit_cost = { + 6, 6, 22, 32, 38, 43, 48, 51, + 54, 57, 59, 61, 64, 65, 67, 69, + 70, 72, 73, 74, 75, 76, 77, 79, + 80, 80, 81, 82, 83, 84, 85, 85, + 86, 87, 88, 88, 89, 89, 90, 91, + 91, 92, 92, 93, 93, 94, 95, 95, + 96, 96, 96, 97, 97, 98, 98, 99, + 99, 99, 100, 100, 101, 101, 101, 102, + 102, 102, 103, 103, 104, 104, 104, 105, + 105, 105, 105, 106, 106, 106, 107, 107, + 107, 108, 108, 108, 108, 109, 109, 109, + 109, 110, 110, 110, 111, 111, 111, 111, + 112, 112, 112, 112, 112, 113, 113, 113, + 113, 114, 114, 114, 114, 114, 115, 115, + 115, 115, 115, 116, 116, 116, 116, 116, + 117, 117, 117, 117, 117, 118, 118, 118, + 118, 118, 118, 119, 119, 119, 119, 119, + 120, 120, 120, 120, 120, 120, 121, 121, + 121, 121, 121, 121, 121, 122, 122, 122, + 122, 122, 122, 123, 123, 123, 123, 123, + 123, 123, 124, 124, 124, 124, 124, 124, + 124, 125, 125, 125, 125, 125, 125, 125, + 125, 126, 126, 126, 126, 126, 126, 126, + 127, 127, 127, 127, 127, 127, 127, 127, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 133, + 133, 133, 133, 133, 133, 133, 133, 133, + 133, 134, 134, 134, 134, 134, 134, 134, + 134, + }, + .len_sym_cost = 109, + }, { /* match_prob = 0.5 */ + .used_lits_to_lit_cost = { + 16, 16, 32, 41, 48, 53, 57, 60, + 64, 66, 69, 71, 73, 75, 76, 78, + 80, 81, 82, 83, 85, 86, 87, 88, + 89, 90, 91, 92, 92, 93, 94, 95, + 96, 96, 97, 98, 98, 99, 99, 100, + 101, 101, 102, 102, 103, 103, 104, 104, + 105, 105, 106, 106, 107, 107, 108, 108, + 108, 109, 109, 110, 110, 110, 111, 111, + 112, 112, 112, 113, 113, 113, 114, 114, + 114, 115, 115, 115, 115, 116, 116, 116, + 117, 117, 117, 118, 118, 118, 118, 119, + 
119, 119, 119, 120, 120, 120, 120, 121, + 121, 121, 121, 122, 122, 122, 122, 122, + 123, 123, 123, 123, 124, 124, 124, 124, + 124, 125, 125, 125, 125, 125, 126, 126, + 126, 126, 126, 127, 127, 127, 127, 127, + 128, 128, 128, 128, 128, 128, 129, 129, + 129, 129, 129, 129, 130, 130, 130, 130, + 130, 130, 131, 131, 131, 131, 131, 131, + 131, 132, 132, 132, 132, 132, 132, 133, + 133, 133, 133, 133, 133, 133, 134, 134, + 134, 134, 134, 134, 134, 134, 135, 135, + 135, 135, 135, 135, 135, 135, 136, 136, + 136, 136, 136, 136, 136, 136, 137, 137, + 137, 137, 137, 137, 137, 137, 138, 138, + 138, 138, 138, 138, 138, 138, 138, 139, + 139, 139, 139, 139, 139, 139, 139, 139, + 140, 140, 140, 140, 140, 140, 140, 140, + 140, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 143, 143, + 143, 143, 143, 143, 143, 143, 143, 143, + 144, + }, + .len_sym_cost = 93, + }, { /* match_prob = 0.75 */ + .used_lits_to_lit_cost = { + 32, 32, 48, 57, 64, 69, 73, 76, + 80, 82, 85, 87, 89, 91, 92, 94, + 96, 97, 98, 99, 101, 102, 103, 104, + 105, 106, 107, 108, 108, 109, 110, 111, + 112, 112, 113, 114, 114, 115, 115, 116, + 117, 117, 118, 118, 119, 119, 120, 120, + 121, 121, 122, 122, 123, 123, 124, 124, + 124, 125, 125, 126, 126, 126, 127, 127, + 128, 128, 128, 129, 129, 129, 130, 130, + 130, 131, 131, 131, 131, 132, 132, 132, + 133, 133, 133, 134, 134, 134, 134, 135, + 135, 135, 135, 136, 136, 136, 136, 137, + 137, 137, 137, 138, 138, 138, 138, 138, + 139, 139, 139, 139, 140, 140, 140, 140, + 140, 141, 141, 141, 141, 141, 142, 142, + 142, 142, 142, 143, 143, 143, 143, 143, + 144, 144, 144, 144, 144, 144, 145, 145, + 145, 145, 145, 145, 146, 146, 146, 146, + 146, 146, 147, 147, 147, 147, 147, 147, + 147, 148, 148, 148, 148, 148, 148, 149, + 149, 149, 149, 149, 149, 149, 150, 150, + 150, 150, 150, 150, 150, 150, 151, 151, + 151, 151, 151, 151, 151, 151, 152, 152, + 152, 152, 152, 152, 152, 152, 153, 153, + 153, 153, 153, 153, 153, 153, 154, 154, + 154, 154, 154, 154, 154, 154, 154, 155, + 155, 155, 155, 155, 155, 155, 155, 155, + 156, 156, 156, 156, 156, 156, 156, 156, + 156, 157, 157, 157, 157, 157, 157, 157, + 157, 157, 157, 158, 158, 158, 158, 158, + 158, 158, 158, 158, 158, 158, 159, 159, + 159, 159, 159, 159, 159, 159, 159, 159, + 160, + }, + .len_sym_cost = 84, + }, +}; + +/* + * Choose the default costs for literal and length symbols. These symbols are + * both part of the litlen alphabet. + */ +static void +deflate_choose_default_litlen_costs(struct libdeflate_compressor *c, + const u8 *block_begin, u32 block_length, + u32 *lit_cost, u32 *len_sym_cost) +{ + unsigned num_used_literals = 0; + u32 literal_freq = block_length; + u32 match_freq = 0; + u32 cutoff; + u32 i; + + /* Calculate the number of distinct literals that exist in the data. */ + __builtin_memset(c->freqs.litlen, 0, + DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0])); + cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */ + for (i = 0; i < block_length; i++) + c->freqs.litlen[block_begin[i]]++; + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + if (c->freqs.litlen[i] > cutoff) + num_used_literals++; + } + if (num_used_literals == 0) + num_used_literals = 1; + + /* + * Estimate the relative frequency of literals and matches in the + * optimal parsing solution. We don't know the optimal solution, so + * this can only be a very rough estimate. Therefore, we basically use + * the match frequency from a greedy parse. 
We also apply the min_len + * heuristic used by the greedy and lazy parsers, to avoid counting too + * many matches when literals are cheaper than short matches. + */ + match_freq = 0; + i = choose_min_match_len(num_used_literals, c->max_search_depth); + for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { + match_freq += c->p.n.match_len_freqs[i]; + literal_freq -= i * c->p.n.match_len_freqs[i]; + } + if ((s32)literal_freq < 0) /* shouldn't happen */ + literal_freq = 0; + + if (match_freq > literal_freq) + i = 2; /* many matches */ + else if (match_freq * 4 > literal_freq) + i = 1; /* neutral */ + else + i = 0; /* few matches */ + + STATIC_ASSERT(BIT_COST == 16); + *lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[ + num_used_literals]; + *len_sym_cost = default_litlen_costs[i].len_sym_cost; +} + +static u32 +deflate_default_length_cost(unsigned len, u32 len_sym_cost) +{ + unsigned slot = deflate_length_slot[len]; + u32 num_extra_bits = deflate_extra_length_bits[slot]; + + return len_sym_cost + (num_extra_bits * BIT_COST); +} + +static u32 +deflate_default_offset_slot_cost(unsigned slot) +{ + u32 num_extra_bits = deflate_extra_offset_bits[slot]; + /* + * Assume that all offset symbols are equally probable. + * The resulting cost is 'int(-log2(1/30) * BIT_COST)', + * where 30 is the number of potentially-used offset symbols. + */ + u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000; + + return offset_sym_cost + (num_extra_bits * BIT_COST); +} + +/* Set default symbol costs for the first block's first optimization pass. */ +static void +deflate_set_default_costs(struct libdeflate_compressor *c, + u32 lit_cost, u32 len_sym_cost) +{ + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + c->p.n.costs.literal[i] = lit_cost; + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + c->p.n.costs.length[i] = + deflate_default_length_cost(i, len_sym_cost); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + c->p.n.costs.offset_slot[i] = + deflate_default_offset_slot_cost(i); +} + +static void +deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount) +{ + if (change_amount == 0) + /* Block is very similar to previous; prefer previous costs. */ + *cost_p = (default_cost + 3 * *cost_p) / 4; + else if (change_amount == 1) + *cost_p = (default_cost + *cost_p) / 2; + else if (change_amount == 2) + *cost_p = (5 * default_cost + 3 * *cost_p) / 8; + else + /* Block differs greatly from previous; prefer default costs. */ + *cost_p = (3 * default_cost + *cost_p) / 4; +} + +static void +deflate_adjust_costs_impl(struct libdeflate_compressor *c, + u32 lit_cost, u32 len_sym_cost, int change_amount) +{ + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost, + change_amount); + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + deflate_adjust_cost(&c->p.n.costs.length[i], + deflate_default_length_cost(i, + len_sym_cost), + change_amount); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + deflate_adjust_cost(&c->p.n.costs.offset_slot[i], + deflate_default_offset_slot_cost(i), + change_amount); +} + +/* + * Adjust the costs when beginning a new block. + * + * Since the current costs are optimized for the data already, it can be helpful + * to reuse them instead of starting over with the default costs. 
However, this + * depends on how similar the new block is to the previous block. Therefore, + * use a heuristic to decide how similar the blocks are, and mix together the + * current costs and the default costs accordingly. + */ +static void +deflate_adjust_costs(struct libdeflate_compressor *c, + u32 lit_cost, u32 len_sym_cost) +{ + u64 total_delta = 0; + u64 cutoff; + int i; + + /* + * Decide how different the current block is from the previous block, + * using the block splitting statistics from the current and previous + * blocks. The more different the current block is, the more we prefer + * the default costs rather than the previous block's costs. + * + * The algorithm here is similar to the end-of-block check one, but here + * we compare two entire blocks rather than a partial block with a small + * extra part, and therefore we need 64-bit numbers in some places. + */ + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + u64 prev = (u64)c->p.n.prev_observations[i] * + c->split_stats.num_observations; + u64 cur = (u64)c->split_stats.observations[i] * + c->p.n.prev_num_observations; + + total_delta += prev > cur ? prev - cur : cur - prev; + } + cutoff = ((u64)c->p.n.prev_num_observations * + c->split_stats.num_observations * 200) / 512; + + if (total_delta > 3 * cutoff) + /* Big change in the data; just use the default costs. */ + deflate_set_default_costs(c, lit_cost, len_sym_cost); + else if (4 * total_delta > 9 * cutoff) + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3); + else if (2 * total_delta > 3 * cutoff) + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2); + else if (2 * total_delta > cutoff) + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1); + else + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0); +} + +static void +deflate_set_initial_costs(struct libdeflate_compressor *c, + const u8 *block_begin, u32 block_length, + bool is_first_block) +{ + u32 lit_cost, len_sym_cost; + + deflate_choose_default_litlen_costs(c, block_begin, block_length, + &lit_cost, &len_sym_cost); + if (is_first_block) + deflate_set_default_costs(c, lit_cost, len_sym_cost); + else + deflate_adjust_costs(c, lit_cost, len_sym_cost); +} + +/* + * Find the minimum-cost path through the graph of possible match/literal + * choices for this block. + * + * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which + * represents the node at the beginning of the block, to + * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of + * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'. + * + * The algorithm works backwards, starting at the end node and proceeding + * backwards one node at a time. At each node, the minimum cost to reach the + * end node is computed and the match/literal choice that begins that path is + * saved. + */ +static void +deflate_find_min_cost_path(struct libdeflate_compressor *c, + const u32 block_length, + const struct lz_match *cache_ptr) +{ + struct deflate_optimum_node *end_node = + &c->p.n.optimum_nodes[block_length]; + struct deflate_optimum_node *cur_node = end_node; + + cur_node->cost_to_end = 0; + do { + unsigned num_matches; + unsigned literal; + u32 best_cost_to_end; + + cur_node--; + cache_ptr--; + + num_matches = cache_ptr->length; + literal = cache_ptr->offset; + + /* It's always possible to choose a literal. 
*/ + best_cost_to_end = c->p.n.costs.literal[literal] + + (cur_node + 1)->cost_to_end; + cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; + + /* Also consider matches if there are any. */ + if (num_matches) { + const struct lz_match *match; + unsigned len; + unsigned offset; + unsigned offset_slot; + u32 offset_cost; + u32 cost_to_end; + + /* + * Consider each length from the minimum + * (DEFLATE_MIN_MATCH_LEN) to the length of the longest + * match found at this position. For each length, we + * consider only the smallest offset for which that + * length is available. Although this is not guaranteed + * to be optimal due to the possibility of a larger + * offset costing less than a smaller offset to code, + * this is a very useful heuristic. + */ + match = cache_ptr - num_matches; + len = DEFLATE_MIN_MATCH_LEN; + do { + offset = match->offset; + offset_slot = c->p.n.offset_slot_full[offset]; + offset_cost = + c->p.n.costs.offset_slot[offset_slot]; + do { + cost_to_end = offset_cost + + c->p.n.costs.length[len] + + (cur_node + len)->cost_to_end; + if (cost_to_end < best_cost_to_end) { + best_cost_to_end = cost_to_end; + cur_node->item = len | + ((u32)offset << + OPTIMUM_OFFSET_SHIFT); + } + } while (++len <= match->length); + } while (++match != cache_ptr); + cache_ptr -= num_matches; + } + cur_node->cost_to_end = best_cost_to_end; + } while (cur_node != &c->p.n.optimum_nodes[0]); + + deflate_reset_symbol_frequencies(c); + deflate_tally_item_list(c, block_length); + deflate_make_huffman_codes(&c->freqs, &c->codes); +} + +/* + * Choose the literals and matches for the current block, then output the block. + * + * To choose the literal/match sequence, we find the minimum-cost path through + * the block's graph of literal/match choices, given a cost model. However, the + * true cost of each symbol is unknown until the Huffman codes have been built, + * but at the same time the Huffman codes depend on the frequencies of chosen + * symbols. Consequently, multiple passes must be used to try to approximate an + * optimal solution. The first pass uses default costs, mixed with the costs + * from the previous block when it seems appropriate. Later passes use the + * Huffman codeword lengths from the previous pass as the costs. + * + * As an alternate strategy, also consider using only literals. The boolean + * returned in *used_only_literals indicates whether that strategy was best. + */ +static void +deflate_optimize_and_flush_block(struct libdeflate_compressor *c, + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct lz_match *cache_ptr, + bool is_first_block, bool is_final_block, + bool *used_only_literals) +{ + unsigned num_passes_remaining = c->p.n.max_optim_passes; + u32 best_true_cost = UINT32_MAX; + u32 true_cost; + u32 only_lits_cost; + u32 static_cost = UINT32_MAX; + struct deflate_sequence seq_; + struct deflate_sequence *seq = NULL; + u32 i; + + /* + * On some data, using only literals (no matches) ends up being better + * than what the iterative optimization algorithm produces. Therefore, + * consider using only literals. + */ + deflate_choose_all_literals(c, block_begin, block_length); + only_lits_cost = deflate_compute_true_cost(c); + + /* + * Force the block to really end at the desired length, even if some + * matches extend beyond it. 
+ */ + for (i = block_length; + i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, + ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) + c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; + + /* + * Sometimes a static Huffman block ends up being cheapest, particularly + * if the block is small. So, if the block is sufficiently small, find + * the optimal static block solution and remember its cost. + */ + if (block_length <= c->p.n.max_len_to_optimize_static_block) { + /* Save c->p.n.costs temporarily. */ + c->p.n.costs_saved = c->p.n.costs; + + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; + static_cost += 7; /* for the end-of-block symbol */ + + /* Restore c->p.n.costs. */ + c->p.n.costs = c->p.n.costs_saved; + } + + /* Initialize c->p.n.costs with default costs. */ + deflate_set_initial_costs(c, block_begin, block_length, is_first_block); + + do { + /* + * Find the minimum-cost path for this pass. + * Also set c->freqs and c->codes to match the path. + */ + deflate_find_min_cost_path(c, block_length, cache_ptr); + + /* + * Compute the exact cost of the block if the path were to be + * used. Note that this differs from + * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses + * the actual Huffman codes instead of c->p.n.costs. + */ + true_cost = deflate_compute_true_cost(c); + + /* + * If the cost didn't improve much from the previous pass, then + * doing more passes probably won't be helpful, so stop early. + */ + if (true_cost + c->p.n.min_improvement_to_continue > + best_true_cost) + break; + + best_true_cost = true_cost; + + /* Save the cost model that gave 'best_true_cost'. */ + c->p.n.costs_saved = c->p.n.costs; + + /* Update the cost model from the Huffman codes. */ + deflate_set_costs_from_codes(c, &c->codes.lens); + + } while (--num_passes_remaining); + + *used_only_literals = false; + if (MIN(only_lits_cost, static_cost) < best_true_cost) { + if (only_lits_cost < static_cost) { + /* Using only literals ended up being best! */ + deflate_choose_all_literals(c, block_begin, block_length); + deflate_set_costs_from_codes(c, &c->codes.lens); + seq_.litrunlen_and_length = block_length; + seq = &seq_; + *used_only_literals = true; + } else { + /* Static block ended up being best! */ + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + } + } else if (true_cost >= + best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { + /* + * The best solution was actually from a non-final optimization + * pass, so recover and use the min-cost path from that pass. 
+ */ + c->p.n.costs = c->p.n.costs_saved; + deflate_find_min_cost_path(c, block_length, cache_ptr); + deflate_set_costs_from_codes(c, &c->codes.lens); + } + deflate_flush_block(c, os, block_begin, block_length, seq, + is_final_block); +} + +static void +deflate_near_optimal_init_stats(struct libdeflate_compressor *c) +{ + init_block_split_stats(&c->split_stats); + __builtin_memset(c->p.n.new_match_len_freqs, 0, + sizeof(c->p.n.new_match_len_freqs)); + __builtin_memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); +} + +static void +deflate_near_optimal_merge_stats(struct libdeflate_compressor *c) +{ + unsigned i; + + merge_new_observations(&c->split_stats); + for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { + c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i]; + c->p.n.new_match_len_freqs[i] = 0; + } +} + +/* + * Save some literal/match statistics from the previous block so that + * deflate_adjust_costs() will be able to decide how much the current block + * differs from the previous one. + */ +static void +deflate_near_optimal_save_stats(struct libdeflate_compressor *c) +{ + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) + c->p.n.prev_observations[i] = c->split_stats.observations[i]; + c->p.n.prev_num_observations = c->split_stats.num_observations; +} + +static void +deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c) +{ + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) + c->split_stats.observations[i] = 0; + c->split_stats.num_observations = 0; + __builtin_memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); +} + +/* + * This is the "near-optimal" DEFLATE compressor. It computes the optimal + * representation of each DEFLATE block using a minimum-cost path search over + * the graph of possible match/literal choices for that block, assuming a + * certain cost for each Huffman symbol. + * + * For several reasons, the end result is not guaranteed to be optimal: + * + * - Nonoptimal choice of blocks + * - Heuristic limitations on which matches are actually considered + * - Symbol costs are unknown until the symbols have already been chosen + * (so iterative optimization must be used) + */ +static void +deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) +{ + const u8 *in_next = in; + const u8 *in_block_begin = in_next; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + const u8 *in_next_slide = + in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE); + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + struct lz_match *cache_ptr = c->p.n.match_cache; + u32 next_hashes[2] = {0, 0}; + bool prev_block_used_only_literals = false; + + bt_matchfinder_init(&c->p.n.bt_mf); + deflate_near_optimal_init_stats(c); + + do { + /* Starting a new DEFLATE block */ + const u8 * const in_max_block_end = choose_max_block_end( + in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH); + const u8 *prev_end_block_check = NULL; + bool change_detected = false; + const u8 *next_observation = in_next; + unsigned min_len; + + /* + * Use the minimum match length heuristic to improve the + * literal/match statistics gathered during matchfinding. + * However, the actual near-optimal parse won't respect min_len, + * as it can accurately assess the costs of different matches. 
+ * + * If the "use only literals" strategy happened to be the best + * strategy on the previous block, then probably the + * min_match_len heuristic is still not aggressive enough for + * the data, so force gathering literal stats only. + */ + if (prev_block_used_only_literals) + min_len = DEFLATE_MAX_MATCH_LEN + 1; + else + min_len = calculate_min_match_len( + in_block_begin, + in_max_block_end - in_block_begin, + c->max_search_depth); + + /* + * Find matches until we decide to end the block. We end the + * block if any of the following is true: + * + * (1) Maximum block length has been reached + * (2) Match catch may overflow. + * (3) Block split heuristic says to split now. + */ + for (;;) { + struct lz_match *matches; + unsigned best_len; + size_t remaining = in_end - in_next; + + /* Slide the window forward if needed. */ + if (in_next == in_next_slide) { + bt_matchfinder_slide_window(&c->p.n.bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + + MIN(remaining, MATCHFINDER_WINDOW_SIZE); + } + + /* + * Find matches with the current position using the + * binary tree matchfinder and save them in match_cache. + * + * Note: the binary tree matchfinder is more suited for + * optimal parsing than the hash chain matchfinder. The + * reasons for this include: + * + * - The binary tree matchfinder can find more matches + * in the same number of steps. + * - One of the major advantages of hash chains is that + * skipping positions (not searching for matches at + * them) is faster; however, with optimal parsing we + * search for matches at almost all positions, so this + * advantage of hash chains is negated. + */ + matches = cache_ptr; + best_len = 0; + adjust_max_and_nice_len(&max_len, &nice_len, remaining); + if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) { + cache_ptr = bt_matchfinder_get_matches( + &c->p.n.bt_mf, + in_cur_base, + in_next - in_cur_base, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + matches); + if (cache_ptr > matches) + best_len = cache_ptr[-1].length; + } + if (in_next >= next_observation) { + if (best_len >= min_len) { + observe_match(&c->split_stats, + best_len); + next_observation = in_next + best_len; + c->p.n.new_match_len_freqs[best_len]++; + } else { + observe_literal(&c->split_stats, + *in_next); + next_observation = in_next + 1; + } + } + + cache_ptr->length = cache_ptr - matches; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + + /* + * If there was a very long match found, don't cache any + * matches for the bytes covered by that match. This + * avoids degenerate behavior when compressing highly + * redundant data, where the number of matches can be + * very large. + * + * This heuristic doesn't actually hurt the compression + * ratio very much. If there's a long match, then the + * data must be highly compressible, so it doesn't + * matter much what we do. 
+ */ + if (best_len >= DEFLATE_MIN_MATCH_LEN && + best_len >= nice_len) { + --best_len; + do { + remaining = in_end - in_next; + if (in_next == in_next_slide) { + bt_matchfinder_slide_window( + &c->p.n.bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + + MIN(remaining, + MATCHFINDER_WINDOW_SIZE); + } + adjust_max_and_nice_len(&max_len, + &nice_len, + remaining); + if (max_len >= + BT_MATCHFINDER_REQUIRED_NBYTES) { + bt_matchfinder_skip_byte( + &c->p.n.bt_mf, + in_cur_base, + in_next - in_cur_base, + nice_len, + c->max_search_depth, + next_hashes); + } + cache_ptr->length = 0; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + } while (--best_len); + } + /* Maximum block length or end of input reached? */ + if (in_next >= in_max_block_end) + break; + /* Match cache overflowed? */ + if (cache_ptr >= + &c->p.n.match_cache[MATCH_CACHE_LENGTH]) + break; + /* Not ready to try to end the block (again)? */ + if (!ready_to_check_block(&c->split_stats, + in_block_begin, in_next, + in_end)) + continue; + /* Check if it would be worthwhile to end the block. */ + if (do_end_block_check(&c->split_stats, + in_next - in_block_begin)) { + change_detected = true; + break; + } + /* Ending the block doesn't seem worthwhile here. */ + deflate_near_optimal_merge_stats(c); + prev_end_block_check = in_next; + } + /* + * All the matches for this block have been cached. Now choose + * the precise end of the block and the sequence of items to + * output to represent it, then flush the block. + */ + if (change_detected && prev_end_block_check != NULL) { + /* + * The block is being ended because a recent chunk of + * data differs from the rest of the block. We could + * end the block at 'in_next' like the greedy and lazy + * compressors do, but that's not ideal since it would + * include the differing chunk in the block. The + * near-optimal compressor has time to do a better job. + * Therefore, we rewind to just before the chunk, and + * output a block that only goes up to there. + * + * We then set things up to correctly start the next + * block, considering that some work has already been + * done on it (some matches found and stats gathered). + */ + struct lz_match *orig_cache_ptr = cache_ptr; + const u8 *in_block_end = prev_end_block_check; + u32 block_length = in_block_end - in_block_begin; + bool is_first = (in_block_begin == in); + bool is_final = false; + u32 num_bytes_to_rewind = in_next - in_block_end; + size_t cache_len_rewound; + + /* Rewind the match cache. */ + do { + cache_ptr--; + cache_ptr -= cache_ptr->length; + } while (--num_bytes_to_rewind); + cache_len_rewound = orig_cache_ptr - cache_ptr; + + deflate_optimize_and_flush_block( + c, os, in_block_begin, + block_length, cache_ptr, + is_first, is_final, + &prev_block_used_only_literals); + __builtin_memmove(c->p.n.match_cache, cache_ptr, + cache_len_rewound * sizeof(*cache_ptr)); + cache_ptr = &c->p.n.match_cache[cache_len_rewound]; + deflate_near_optimal_save_stats(c); + /* + * Clear the stats for the just-flushed block, leaving + * just the stats for the beginning of the next block. + */ + deflate_near_optimal_clear_old_stats(c); + in_block_begin = in_block_end; + } else { + /* + * The block is being ended for a reason other than a + * differing data chunk being detected. Don't rewind at + * all; just end the block at the current position. 
+ */ + u32 block_length = in_next - in_block_begin; + bool is_first = (in_block_begin == in); + bool is_final = (in_next == in_end); + + deflate_near_optimal_merge_stats(c); + deflate_optimize_and_flush_block( + c, os, in_block_begin, + block_length, cache_ptr, + is_first, is_final, + &prev_block_used_only_literals); + cache_ptr = &c->p.n.match_cache[0]; + deflate_near_optimal_save_stats(c); + deflate_near_optimal_init_stats(c); + in_block_begin = in_next; + } + } while (in_next != in_end && !os->overflow); +} + +/* Initialize c->p.n.offset_slot_full. */ +static void +deflate_init_offset_slot_full(struct libdeflate_compressor *c) +{ + unsigned offset_slot; + unsigned offset; + unsigned offset_end; + + for (offset_slot = 0; offset_slot < ARRAY_LEN(deflate_offset_slot_base); + offset_slot++) { + offset = deflate_offset_slot_base[offset_slot]; + offset_end = offset + + (1 << deflate_extra_offset_bits[offset_slot]); + do { + c->p.n.offset_slot_full[offset] = offset_slot; + } while (++offset != offset_end); + } +} + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +struct libdeflate_compressor * +libdeflate_alloc_compressor_ex(int compression_level, + const struct libdeflate_options *options) +{ + struct libdeflate_compressor *c; + size_t size = offsetof(struct libdeflate_compressor, p); + + check_buildtime_parameters(); + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + if (compression_level < 0 || compression_level > 12) + return NULL; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + if (compression_level >= 10) + size += sizeof(c->p.n); + else +#endif + { + if (compression_level >= 2) + size += sizeof(c->p.g); + else if (compression_level == 1) + size += sizeof(c->p.f); + } + + c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size); + if (!c) + return NULL; + + c->compression_level = compression_level; + + /* + * The higher the compression level, the more we should bother trying to + * compress very small inputs. + */ + c->max_passthrough_size = 55 - (compression_level * 4); + + switch (compression_level) { + case 0: + c->max_passthrough_size = SIZE_MAX; + c->impl = NULL; /* not used */ + break; + case 1: + c->impl = deflate_compress_fastest; + /* max_search_depth is unused. 
*/ + c->nice_match_length = 32; + break; + case 2: + c->impl = deflate_compress_greedy; + c->max_search_depth = 6; + c->nice_match_length = 10; + break; + case 3: + c->impl = deflate_compress_greedy; + c->max_search_depth = 12; + c->nice_match_length = 14; + break; + case 4: + c->impl = deflate_compress_greedy; + c->max_search_depth = 16; + c->nice_match_length = 30; + break; + case 5: + c->impl = deflate_compress_lazy; + c->max_search_depth = 16; + c->nice_match_length = 30; + break; + case 6: + c->impl = deflate_compress_lazy; + c->max_search_depth = 35; + c->nice_match_length = 65; + break; + case 7: + c->impl = deflate_compress_lazy; + c->max_search_depth = 100; + c->nice_match_length = 130; + break; + case 8: + c->impl = deflate_compress_lazy2; + c->max_search_depth = 300; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + break; + case 9: +#if !SUPPORT_NEAR_OPTIMAL_PARSING + default: +#endif + c->impl = deflate_compress_lazy2; + c->max_search_depth = 600; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + break; +#if SUPPORT_NEAR_OPTIMAL_PARSING + case 10: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 35; + c->nice_match_length = 75; + c->p.n.max_optim_passes = 2; + c->p.n.min_improvement_to_continue = 32; + c->p.n.min_bits_to_use_nonfinal_path = 32; + c->p.n.max_len_to_optimize_static_block = 0; + deflate_init_offset_slot_full(c); + break; + case 11: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 100; + c->nice_match_length = 150; + c->p.n.max_optim_passes = 4; + c->p.n.min_improvement_to_continue = 16; + c->p.n.min_bits_to_use_nonfinal_path = 16; + c->p.n.max_len_to_optimize_static_block = 1000; + deflate_init_offset_slot_full(c); + break; + case 12: + default: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 300; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + c->p.n.max_optim_passes = 10; + c->p.n.min_improvement_to_continue = 1; + c->p.n.min_bits_to_use_nonfinal_path = 1; + c->p.n.max_len_to_optimize_static_block = 10000; + deflate_init_offset_slot_full(c); + break; +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + } + + deflate_init_static_codes(c); + + return c; +} + + +struct libdeflate_compressor * +libdeflate_alloc_compressor(int compression_level) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_compressor_ex(compression_level, &defaults); +} + +size_t +libdeflate_deflate_compress(struct libdeflate_compressor *c, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) +{ + struct deflate_output_bitstream os; + + /* + * For extremely short inputs, or for compression level 0, just output + * uncompressed blocks. + */ + if (unlikely(in_nbytes <= c->max_passthrough_size)) + return deflate_compress_none(in, in_nbytes, + out, out_nbytes_avail); + + /* Initialize the output bitstream structure. */ + os.bitbuf = 0; + os.bitcount = 0; + os.next = out; + os.end = os.next + out_nbytes_avail; + os.overflow = false; + + /* Call the actual compression function. */ + (*c->impl)(c, in, in_nbytes, &os); + + /* Return 0 if the output buffer is too small. */ + if (os.overflow) + return 0; + + /* + * Write the final byte if needed. This can't overflow the output + * buffer because deflate_flush_block() would have set the overflow flag + * if there wasn't enough space remaining for the full final block. 
+ */ + ASSERT(os.bitcount <= 7); + if (os.bitcount) { + ASSERT(os.next < os.end); + *os.next++ = os.bitbuf; + } + + /* Return the compressed size in bytes. */ + return os.next - (u8 *)out; +} + +void +libdeflate_free_compressor(struct libdeflate_compressor *c) +{ + if (c) + libdeflate_aligned_free(c); +} + +unsigned int +libdeflate_get_compression_level(struct libdeflate_compressor *c) +{ + return c->compression_level; +} + +size_t +libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, + size_t in_nbytes) +{ + size_t max_blocks; + + /* + * Since the compressor never uses a compressed block when an + * uncompressed block is cheaper, the worst case can be no worse than + * the case where only uncompressed blocks are used. + * + * This is true even though up to 7 bits are "wasted" to byte-align the + * bitstream when a compressed block is followed by an uncompressed + * block. This is because a compressed block wouldn't have been used if + * it wasn't cheaper than an uncompressed block, and uncompressed blocks + * always end on a byte boundary. So the alignment bits will, at worst, + * go up to the place where the uncompressed block would have ended. + */ + + /* + * Calculate the maximum number of uncompressed blocks that the + * compressor can use for 'in_nbytes' of data. + * + * The minimum length that is passed to deflate_flush_block() is + * MIN_BLOCK_LENGTH bytes, except for the final block if needed. If + * deflate_flush_block() decides to use an uncompressed block, it + * actually will (in general) output a series of uncompressed blocks in + * order to stay within the UINT16_MAX limit of DEFLATE. But this can + * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX', + * as in that case this behavior can't result in more blocks than the + * case where deflate_flush_block() is called with min-length inputs. + * + * So the number of uncompressed blocks needed would be bounded by + * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH). However, empty inputs + * need 1 (empty) block, which gives the final expression below. + */ + STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX); + max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); + + /* + * Each uncompressed block has 5 bytes of overhead, for the BFINAL, + * BTYPE, LEN, and NLEN fields. (For the reason explained earlier, the + * alignment bits at the very start of the block can be disregarded; + * they would otherwise increase the overhead to 6 bytes per block.) + * Therefore, the maximum number of overhead bytes is '5 * max_blocks'. + * To get the final bound, add the number of uncompressed bytes. + */ + return (5 * max_blocks) + in_nbytes; +} diff --git a/packages/wasm/lib/libdeflate/deflate_compress.h b/packages/wasm/lib/libdeflate/deflate_compress.h new file mode 100644 index 00000000..bd7a89f9 --- /dev/null +++ b/packages/wasm/lib/libdeflate/deflate_compress.h @@ -0,0 +1,20 @@ +#ifndef LIB_DEFLATE_COMPRESS_H +#define LIB_DEFLATE_COMPRESS_H + +#include "lib_common.h" + +/* + * DEFLATE compression is private to deflate_compress.c, but we do need to be + * able to query the compression level for zlib and gzip header generation. 
+ */ + +struct libdeflate_compressor; + +unsigned int libdeflate_get_compression_level(struct libdeflate_compressor *c); +size_t libdeflate_deflate_compress(struct libdeflate_compressor *c, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +size_t libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, size_t in_nbytes); + +#endif /* LIB_DEFLATE_COMPRESS_H */ diff --git a/packages/wasm/lib/libdeflate/deflate_constants.h b/packages/wasm/lib/libdeflate/deflate_constants.h new file mode 100644 index 00000000..95c9e0a5 --- /dev/null +++ b/packages/wasm/lib/libdeflate/deflate_constants.h @@ -0,0 +1,56 @@ +/* + * deflate_constants.h - constants for the DEFLATE compression format + */ + +#ifndef LIB_DEFLATE_CONSTANTS_H +#define LIB_DEFLATE_CONSTANTS_H + +/* Valid block types */ +#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0 +#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1 +#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2 + +/* Minimum and maximum supported match lengths (in bytes) */ +#define DEFLATE_MIN_MATCH_LEN 3 +#define DEFLATE_MAX_MATCH_LEN 258 + +/* Maximum supported match offset (in bytes) */ +#define DEFLATE_MAX_MATCH_OFFSET 32768 + +/* log2 of DEFLATE_MAX_MATCH_OFFSET */ +#define DEFLATE_WINDOW_ORDER 15 + +/* Number of symbols in each Huffman code. Note: for the literal/length + * and offset codes, these are actually the maximum values; a given block + * might use fewer symbols. */ +#define DEFLATE_NUM_PRECODE_SYMS 19 +#define DEFLATE_NUM_LITLEN_SYMS 288 +#define DEFLATE_NUM_OFFSET_SYMS 32 + +/* The maximum number of symbols across all codes */ +#define DEFLATE_MAX_NUM_SYMS 288 + +/* Division of symbols in the literal/length code */ +#define DEFLATE_NUM_LITERALS 256 +#define DEFLATE_END_OF_BLOCK 256 +#define DEFLATE_FIRST_LEN_SYM 257 + +/* Maximum codeword length, in bits, within each Huffman code */ +#define DEFLATE_MAX_PRE_CODEWORD_LEN 7 +#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15 +#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15 + +/* The maximum codeword length across all codes */ +#define DEFLATE_MAX_CODEWORD_LEN 15 + +/* Maximum possible overrun when decoding codeword lengths */ +#define DEFLATE_MAX_LENS_OVERRUN 137 + +/* + * Maximum number of extra bits that may be required to represent a match + * length or offset. + */ +#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5 +#define DEFLATE_MAX_EXTRA_OFFSET_BITS 13 + +#endif /* LIB_DEFLATE_CONSTANTS_H */ diff --git a/packages/wasm/lib/libdeflate/deflate_decompress.c b/packages/wasm/lib/libdeflate/deflate_decompress.c new file mode 100644 index 00000000..07aaa442 --- /dev/null +++ b/packages/wasm/lib/libdeflate/deflate_decompress.c @@ -0,0 +1,1200 @@ +/* + * deflate_decompress.c - a decompressor for DEFLATE + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * --------------------------------------------------------------------------- + * + * This is a highly optimized DEFLATE decompressor. It is much faster than + * vanilla zlib, typically well over twice as fast, though results vary by CPU. + * + * Why this is faster than vanilla zlib: + * + * - Word accesses rather than byte accesses when reading input + * - Word accesses rather than byte accesses when copying matches + * - Faster Huffman decoding combined with various DEFLATE-specific tricks + * - Larger bitbuffer variable that doesn't need to be refilled as often + * - Other optimizations to remove unnecessary branches + * - Only full-buffer decompression is supported, so the code doesn't need to + * support stopping and resuming decompression. + * - On x86_64, a version of the decompression routine is compiled with BMI2 + * instructions enabled and is used automatically at runtime when supported. + */ + +#include "lib_common.h" +#include "deflate_constants.h" + +/* + * If the expression passed to SAFETY_CHECK() evaluates to false, then the + * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the + * compressed data is invalid. + * + * Theoretically, these checks could be disabled for specialized applications + * where all input to the decompressor will be trusted. + */ +#if 0 +# pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!") +# define SAFETY_CHECK(expr) (void)(expr) +#else +# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA +#endif + +/***************************************************************************** + * Input bitstream * + *****************************************************************************/ + +/* + * The state of the "input bitstream" consists of the following variables: + * + * - in_next: a pointer to the next unread byte in the input buffer + * + * - in_end: a pointer to just past the end of the input buffer + * + * - bitbuf: a word-sized variable containing bits that have been read from + * the input buffer or from the implicit appended zero bytes + * + * - bitsleft: the number of bits in 'bitbuf' available to be consumed. + * After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually + * contain more bits than this. However, only the bits counted + * by 'bitsleft' can actually be consumed; the rest can only be + * used for preloading. + * + * As a micro-optimization, we allow bits 8 and higher of + * 'bitsleft' to contain garbage. When consuming the bits + * associated with a decode table entry, this allows us to do + * 'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'. + * On some CPUs, this helps reduce instruction dependencies. + * This does have the disadvantage that 'bitsleft' sometimes + * needs to be cast to 'u8', such as when it's used as a shift + * amount in REFILL_BITS_BRANCHLESS(). But that one happens + * for free since most CPUs ignore high bits in shift amounts. 
+ * + * - overread_count: the total number of implicit appended zero bytes that + * have been loaded into the bitbuffer, including any + * counted by 'bitsleft' and any already consumed + */ + +/* + * The type for the bitbuffer variable ('bitbuf' described above). For best + * performance, this should have size equal to a machine word. + * + * 64-bit platforms have a significant advantage: they get a bigger bitbuffer + * which they don't have to refill as often. + */ +typedef machine_word_t bitbuf_t; +#define BITBUF_NBITS (8 * (int)sizeof(bitbuf_t)) + +/* BITMASK(n) returns a bitmask of length 'n'. */ +#define BITMASK(n) (((bitbuf_t)1 << (n)) - 1) + +/* + * MAX_BITSLEFT is the maximum number of consumable bits, i.e. the maximum value + * of '(u8)bitsleft'. This is the size of the bitbuffer variable, minus 1 if + * the branchless refill method is being used (see REFILL_BITS_BRANCHLESS()). + */ +#define MAX_BITSLEFT \ + (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS) + +/* + * CONSUMABLE_NBITS is the minimum number of bits that are guaranteed to be + * consumable (counted in 'bitsleft') immediately after refilling the bitbuffer. + * Since only whole bytes can be added to 'bitsleft', the worst case is + * 'MAX_BITSLEFT - 7': the smallest amount where another byte doesn't fit. + */ +#define CONSUMABLE_NBITS (MAX_BITSLEFT - 7) + +/* + * FASTLOOP_PRELOADABLE_NBITS is the minimum number of bits that are guaranteed + * to be preloadable immediately after REFILL_BITS_IN_FASTLOOP(). (It is *not* + * guaranteed after REFILL_BITS(), since REFILL_BITS() falls back to a + * byte-at-a-time refill method near the end of input.) This may exceed the + * number of consumable bits (counted by 'bitsleft'). Any bits not counted in + * 'bitsleft' can only be used for precomputation and cannot be consumed. + */ +#define FASTLOOP_PRELOADABLE_NBITS \ + (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS) + +/* + * PRELOAD_SLACK is the minimum number of bits that are guaranteed to be + * preloadable but not consumable, following REFILL_BITS_IN_FASTLOOP() and any + * subsequent consumptions. This is 1 bit if the branchless refill method is + * being used, and 0 bits otherwise. + */ +#define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT) + +/* + * CAN_CONSUME(n) is true if it's guaranteed that if the bitbuffer has just been + * refilled, then it's always possible to consume 'n' bits from it. 'n' should + * be a compile-time constant, to enable compile-time evaluation. + */ +#define CAN_CONSUME(n) (CONSUMABLE_NBITS >= (n)) + +/* + * CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) is true if it's + * guaranteed that after REFILL_BITS_IN_FASTLOOP(), it's always possible to + * consume 'consume_nbits' bits, then preload 'preload_nbits' bits. The + * arguments should be compile-time constants to enable compile-time evaluation. + */ +#define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) \ + (CONSUMABLE_NBITS >= (consume_nbits) && \ + FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits)) + +/* + * REFILL_BITS_BRANCHLESS() branchlessly refills the bitbuffer variable by + * reading the next word from the input buffer and updating 'in_next' and + * 'bitsleft' based on how many bits were refilled -- counting whole bytes only. + * This is much faster than reading a byte at a time, at least if the CPU is + * little endian and supports fast unaligned memory accesses. 
+ * + * The simplest way of branchlessly updating 'bitsleft' would be: + * + * bitsleft += (MAX_BITSLEFT - bitsleft) & ~7; + * + * To make it faster, we define MAX_BITSLEFT to be 'WORDBITS - 1' rather than + * WORDBITS, so that in binary it looks like 111111 or 11111. Then, we update + * 'bitsleft' by just setting the bits above the low 3 bits: + * + * bitsleft |= MAX_BITSLEFT & ~7; + * + * That compiles down to a single instruction like 'or $0x38, %rbp'. Using + * 'MAX_BITSLEFT == WORDBITS - 1' also has the advantage that refills can be + * done when 'bitsleft == MAX_BITSLEFT' without invoking undefined behavior. + * + * The simplest way of branchlessly updating 'in_next' would be: + * + * in_next += (MAX_BITSLEFT - bitsleft) >> 3; + * + * With 'MAX_BITSLEFT == WORDBITS - 1' we could use an XOR instead, though this + * isn't really better: + * + * in_next += (MAX_BITSLEFT ^ bitsleft) >> 3; + * + * An alternative which can be marginally better is the following: + * + * in_next += sizeof(bitbuf_t) - 1; + * in_next -= (bitsleft >> 3) & 0x7; + * + * It seems this would increase the number of CPU instructions from 3 (sub, shr, + * add) to 4 (add, shr, and, sub). However, if the CPU has a bitfield + * extraction instruction (e.g. arm's ubfx), it stays at 3, and is potentially + * more efficient because the length of the longest dependency chain decreases + * from 3 to 2. This alternative also has the advantage that it ignores the + * high bits in 'bitsleft', so it is compatible with the micro-optimization we + * use where we let the high bits of 'bitsleft' contain garbage. + */ +#define REFILL_BITS_BRANCHLESS() \ +do { \ + bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft; \ + in_next += sizeof(bitbuf_t) - 1; \ + in_next -= (bitsleft >> 3) & 0x7; \ + bitsleft |= MAX_BITSLEFT & ~7; \ +} while (0) + +/* + * REFILL_BITS() loads bits from the input buffer until the bitbuffer variable + * contains at least CONSUMABLE_NBITS consumable bits. + * + * This checks for the end of input, and it doesn't guarantee + * FASTLOOP_PRELOADABLE_NBITS, so it can't be used in the fastloop. + * + * If we would overread the input buffer, we just don't read anything, leaving + * the bits zeroed but marking them filled. This simplifies the decompressor + * because it removes the need to always be able to distinguish between real + * overreads and overreads caused only by the decompressor's own lookahead. + * + * We do still keep track of the number of bytes that have been overread, for + * two reasons. First, it allows us to determine the exact number of bytes that + * were consumed once the stream ends or an uncompressed block is reached. + * Second, it allows us to stop early if the overread amount gets so large (more + * than sizeof bitbuf) that it can only be caused by a real overread. (The + * second part is arguably unneeded, since libdeflate is buffer-based; given + * infinite zeroes, it will eventually either completely fill the output buffer + * or return an error. However, we do it to be slightly more friendly to the + * not-recommended use case of decompressing with an unknown output size.) 
+ */ +#define REFILL_BITS() \ +do { \ + if (UNALIGNED_ACCESS_IS_FAST && \ + likely(in_end - in_next >= sizeof(bitbuf_t))) { \ + REFILL_BITS_BRANCHLESS(); \ + } else { \ + while ((u8)bitsleft < CONSUMABLE_NBITS) { \ + if (likely(in_next != in_end)) { \ + bitbuf |= (bitbuf_t)*in_next++ << \ + (u8)bitsleft; \ + } else { \ + overread_count++; \ + SAFETY_CHECK(overread_count <= \ + sizeof(bitbuf_t)); \ + } \ + bitsleft += 8; \ + } \ + } \ +} while (0) + +/* + * REFILL_BITS_IN_FASTLOOP() is like REFILL_BITS(), but it doesn't check for the + * end of the input. It can only be used in the fastloop. + */ +#define REFILL_BITS_IN_FASTLOOP() \ +do { \ + STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST || \ + FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS); \ + if (UNALIGNED_ACCESS_IS_FAST) { \ + REFILL_BITS_BRANCHLESS(); \ + } else { \ + while ((u8)bitsleft < CONSUMABLE_NBITS) { \ + bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \ + bitsleft += 8; \ + } \ + } \ +} while (0) + +/* + * This is the worst-case maximum number of output bytes that are written to + * during each iteration of the fastloop. The worst case is 2 literals, then a + * match of length DEFLATE_MAX_MATCH_LEN. Additionally, some slack space must + * be included for the intentional overrun in the match copy implementation. + */ +#define FASTLOOP_MAX_BYTES_WRITTEN \ + (2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1) + +/* + * This is the worst-case maximum number of input bytes that are read during + * each iteration of the fastloop. To get this value, we first compute the + * greatest number of bits that can be refilled during a loop iteration. The + * refill at the beginning can add at most MAX_BITSLEFT, and the amount that can + * be refilled later is no more than the maximum amount that can be consumed by + * 2 literals that don't need a subtable, then a match. We convert this value + * to bytes, rounding up; this gives the maximum number of bytes that 'in_next' + * can be advanced. Finally, we add sizeof(bitbuf_t) to account for + * REFILL_BITS_BRANCHLESS() reading a word past 'in_next'. + */ +#define FASTLOOP_MAX_BYTES_READ \ + (DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \ + LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \ + sizeof(bitbuf_t)) + +/***************************************************************************** + * Huffman decoding * + *****************************************************************************/ + +/* + * The fastest way to decode Huffman-encoded data is basically to use a decode + * table that maps the next TABLEBITS bits of data to their symbol. Each entry + * decode_table[i] maps to the symbol whose codeword is a prefix of 'i'. A + * symbol with codeword length 'n' has '2**(TABLEBITS-n)' entries in the table. + * + * Ideally, TABLEBITS and the maximum codeword length would be the same; some + * compression formats are designed with this goal in mind. Unfortunately, in + * DEFLATE, the maximum litlen and offset codeword lengths are 15 bits, which is + * too large for a practical TABLEBITS. It's not *that* much larger, though, so + * the workaround is to use a single level of subtables. In the main table, + * entries for prefixes of codewords longer than TABLEBITS contain a "pointer" + * to the appropriate subtable along with the number of bits it is indexed with. + * + * The most efficient way to allocate subtables is to allocate them dynamically + * after the main table. The worst-case number of table entries needed, + * including subtables, is precomputable; see the ENOUGH constants below. 
+ * + * A useful optimization is to store the codeword lengths in the decode table so + * that they don't have to be looked up by indexing a separate table that maps + * symbols to their codeword lengths. We basically do this; however, for the + * litlen and offset codes we also implement some DEFLATE-specific optimizations + * that build in the consideration of the "extra bits" and the + * literal/length/end-of-block division. For the exact decode table entry + * format we use, see the definitions of the *_decode_results[] arrays below. + */ + + +/* + * These are the TABLEBITS values we use for each of the DEFLATE Huffman codes, + * along with their corresponding ENOUGH values. + * + * For the precode, we use PRECODE_TABLEBITS == 7 since this is the maximum + * precode codeword length. This avoids ever needing subtables. + * + * For the litlen and offset codes, we cannot realistically avoid ever needing + * subtables, since litlen and offset codewords can be up to 15 bits. A higher + * TABLEBITS reduces the number of lookups that need a subtable, which increases + * performance; however, it increases memory usage and makes building the table + * take longer, which decreases performance. We choose values that work well in + * practice, making subtables rarely needed without making the tables too large. + * + * Our choice of OFFSET_TABLEBITS == 8 is a bit low; without any special + * considerations, 9 would fit the trade-off curve better. However, there is a + * performance benefit to using exactly 8 bits when it is a compile-time + * constant, as many CPUs can take the low byte more easily than the low 9 bits. + * + * zlib treats its equivalents of TABLEBITS as maximum values; whenever it + * builds a table, it caps the actual table_bits to the longest codeword. This + * makes sense in theory, as there's no need for the table to be any larger than + * needed to support the longest codeword. However, having the table bits be a + * compile-time constant is beneficial to the performance of the decode loop, so + * there is a trade-off. libdeflate currently uses the dynamic table_bits + * strategy for the litlen table only, due to its larger maximum size. + * PRECODE_TABLEBITS and OFFSET_TABLEBITS are smaller, so going dynamic there + * isn't as useful, and OFFSET_TABLEBITS=8 is useful as mentioned above. + * + * Each TABLEBITS value has a corresponding ENOUGH value that gives the + * worst-case maximum number of decode table entries, including the main table + * and all subtables. The ENOUGH value depends on three parameters: + * + * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS) + * (2) the maximum number of main table bits (*_TABLEBITS) + * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN) + * + * The ENOUGH values were computed using the utility program 'enough' from zlib. + */ +#define PRECODE_TABLEBITS 7 +#define PRECODE_ENOUGH 128 /* enough 19 7 7 */ +#define LITLEN_TABLEBITS 11 +#define LITLEN_ENOUGH 2342 /* enough 288 11 15 */ +#define OFFSET_TABLEBITS 8 +#define OFFSET_ENOUGH 402 /* enough 32 8 15 */ + +/* + * make_decode_table_entry() creates a decode table entry for the given symbol + * by combining the static part 'decode_results[sym]' with the dynamic part + * 'len', which is the remaining codeword length (the codeword length for main + * table entries, or the codeword length minus TABLEBITS for subtable entries). 
+ * + * In all cases, we add 'len' to each of the two low-order bytes to create the + * appropriately-formatted decode table entry. See the definitions of the + * *_decode_results[] arrays below, where the entry format is described. + */ +static u32 +make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len) +{ + return decode_results[sym] + (len << 8) + len; +} + +/* + * Here is the format of our precode decode table entries. Bits not explicitly + * described contain zeroes: + * + * Bit 20-16: presym + * Bit 10-8: codeword length [not used] + * Bit 2-0: codeword length + * + * The precode decode table never has subtables, since we use + * PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN. + * + * precode_decode_results[] contains the static part of the entry for each + * symbol. make_decode_table_entry() produces the final entries. + */ +static const u32 precode_decode_results[] = { +#define ENTRY(presym) ((u32)presym << 16) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , +#undef ENTRY +}; + +/* Litlen and offset decode table entry flags */ + +/* Indicates a literal entry in the litlen decode table */ +#define HUFFDEC_LITERAL 0x80000000 + +/* Indicates that HUFFDEC_SUBTABLE_POINTER or HUFFDEC_END_OF_BLOCK is set */ +#define HUFFDEC_EXCEPTIONAL 0x00008000 + +/* Indicates a subtable pointer entry in the litlen or offset decode table */ +#define HUFFDEC_SUBTABLE_POINTER 0x00004000 + +/* Indicates an end-of-block entry in the litlen decode table */ +#define HUFFDEC_END_OF_BLOCK 0x00002000 + +/* Maximum number of bits that can be consumed by decoding a match length */ +#define LENGTH_MAXBITS (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS) +#define LENGTH_MAXFASTBITS (LITLEN_TABLEBITS /* no subtable needed */ + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS) + +/* + * Here is the format of our litlen decode table entries. Bits not explicitly + * described contain zeroes: + * + * Literals: + * Bit 31: 1 (HUFFDEC_LITERAL) + * Bit 23-16: literal value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length [not used] + * Bit 3-0: remaining codeword length + * Lengths: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 24-16: length base value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length + * Bit 4-0: remaining codeword length + number of extra bits + * End of block: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 1 (HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length [not used] + * Bit 3-0: remaining codeword length + * Subtable pointer: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 30-16: index of start of subtable + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: number of subtable bits + * Bit 3-0: number of main table bits + * + * This format has several desirable properties: + * + * - The codeword length, length slot base, and number of extra length bits + * are all built in. This eliminates the need to separately look up this + * information by indexing separate arrays by symbol or length slot. 
+ * + * - The HUFFDEC_* flags enable easily distinguishing between the different + * types of entries. The HUFFDEC_LITERAL flag enables a fast path for + * literals; the high bit is used for this, as some CPUs can test the + * high bit more easily than other bits. The HUFFDEC_EXCEPTIONAL flag + * makes it possible to detect the two unlikely cases (subtable pointer + * and end of block) in a single bit flag test. + * + * - The low byte is the number of bits that need to be removed from the + * bitstream; this makes this value easily accessible, and it enables the + * micro-optimization of doing 'bitsleft -= entry' instead of + * 'bitsleft -= (u8)entry'. It also includes the number of extra bits, + * so they don't need to be removed separately. + * + * - The flags in bits 15-13 are arranged to be 0 when the + * "remaining codeword length" in bits 11-8 is needed, making this value + * fairly easily accessible as well via a shift and downcast. + * + * - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are + * needed, making it possible to extract this value with '& 0x3F' rather + * than '& 0xF'. This value is only used as a shift amount, so this can + * save an 'and' instruction as the masking by 0x3F happens implicitly. + * + * litlen_decode_results[] contains the static part of the entry for each + * symbol. make_decode_table_entry() produces the final entries. + */ +static const u32 litlen_decode_results[] = { + + /* Literals */ +#define ENTRY(literal) (HUFFDEC_LITERAL | ((u32)literal << 16)) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) , + ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) , + ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) , + ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) , + ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) , + ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) , + ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) , + ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) , + ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) , + ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) , + ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) , + ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) , + ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) , + ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) , + ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) , + ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) , + ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) , + ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) , + ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) , + ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) , + ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) , + ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) , + ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) , + ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) , + ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) , + ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) , + ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) , + ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) , + ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) , + ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) , + ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) , + ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) , + ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) , + ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) , + ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) , + ENTRY(156) , 
ENTRY(157) , ENTRY(158) , ENTRY(159) , + ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) , + ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) , + ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) , + ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) , + ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) , + ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) , + ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) , + ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) , + ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) , + ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) , + ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) , + ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) , + ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) , + ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) , + ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) , + ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) , + ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) , + ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) , + ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) , + ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) , + ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) , + ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) , + ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) , + ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) , +#undef ENTRY + + /* End of block */ + HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK, + + /* Lengths */ +#define ENTRY(length_base, num_extra_bits) \ + (((u32)(length_base) << 16) | (num_extra_bits)) + ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), + ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), + ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), + ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), + ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), + ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), + ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), + ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , +#undef ENTRY +}; + +/* Maximum number of bits that can be consumed by decoding a match offset */ +#define OFFSET_MAXBITS (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS) +#define OFFSET_MAXFASTBITS (OFFSET_TABLEBITS /* no subtable needed */ + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS) + +/* + * Here is the format of our offset decode table entries. Bits not explicitly + * described contain zeroes: + * + * Offsets: + * Bit 31-16: offset base value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 11-8: remaining codeword length + * Bit 4-0: remaining codeword length + number of extra bits + * Subtable pointer: + * Bit 31-16: index of start of subtable + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) + * Bit 11-8: number of subtable bits + * Bit 3-0: number of main table bits + * + * These work the same way as the length entries and subtable pointer entries in + * the litlen decode table; see litlen_decode_results[] above. 
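+ *
+ * As a worked example (assuming offset symbol 4 happens to get a 5-bit
+ * codeword, which fits in the 8-bit main table): its static part below is
+ * ENTRY(5, 1) == 0x50001, and make_decode_table_entry() turns that into
+ * 0x50001 + (5 << 8) + 5 == 0x50506, i.e. offset base 5 in bits 31-16,
+ * remaining codeword length 5 in bits 11-8, and 5 + 1 == 6 total bits to
+ * consume (codeword plus extra bits) in bits 4-0.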
+ */ +static const u32 offset_decode_results[] = { +#define ENTRY(offset_base, num_extra_bits) \ + (((u32)(offset_base) << 16) | (num_extra_bits)) + ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , + ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , + ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , + ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , + ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , + ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , + ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , + ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , +#undef ENTRY +}; + +/* + * The main DEFLATE decompressor structure. Since libdeflate only supports + * full-buffer decompression, this structure doesn't store the entire + * decompression state, most of which is in stack variables. Instead, this + * struct just contains the decode tables and some temporary arrays used for + * building them, as these are too large to comfortably allocate on the stack. + * + * Storing the decode tables in the decompressor struct also allows the decode + * tables for the static codes to be reused whenever two static Huffman blocks + * are decoded without an intervening dynamic block, even across streams. + */ +struct libdeflate_decompressor { + + /* + * The arrays aren't all needed at the same time. 'precode_lens' and + * 'precode_decode_table' are unneeded after 'lens' has been filled. + * Furthermore, 'lens' need not be retained after building the litlen + * and offset decode tables. In fact, 'lens' can be in union with + * 'litlen_decode_table' provided that 'offset_decode_table' is separate + * and is built first. + */ + + union { + u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; + + struct { + u8 lens[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS + + DEFLATE_MAX_LENS_OVERRUN]; + + u32 precode_decode_table[PRECODE_ENOUGH]; + } l; + + u32 litlen_decode_table[LITLEN_ENOUGH]; + } u; + + u32 offset_decode_table[OFFSET_ENOUGH]; + + /* used only during build_decode_table() */ + u16 sorted_syms[DEFLATE_MAX_NUM_SYMS]; + + bool static_codes_loaded; + unsigned litlen_tablebits; +}; + +/* + * Build a table for fast decoding of symbols from a Huffman code. As input, + * this function takes the codeword length of each symbol which may be used in + * the code. As output, it produces a decode table for the canonical Huffman + * code described by the codeword lengths. The decode table is built with the + * assumption that it will be indexed with "bit-reversed" codewords, where the + * low-order bit is the first bit of the codeword. This format is used for all + * Huffman codes in DEFLATE. + * + * @decode_table + * The array in which the decode table will be generated. This array must + * have sufficient length; see the definition of the ENOUGH numbers. + * @lens + * An array which provides, for each symbol, the length of the + * corresponding codeword in bits, or 0 if the symbol is unused. This may + * alias @decode_table, since nothing is written to @decode_table until all + * @lens have been consumed. All codeword lengths are assumed to be <= + * @max_codeword_len but are otherwise considered untrusted. If they do + * not form a valid Huffman code, then the decode table is not built and + * %false is returned. + * @num_syms + * The number of symbols in the code, including all unused symbols. 
+ * @decode_results + * An array which gives the incomplete decode result for each symbol. The + * needed values in this array will be combined with codeword lengths to + * make the final decode table entries using make_decode_table_entry(). + * @table_bits + * The log base-2 of the number of main table entries to use. + * If @table_bits_ret != NULL, then @table_bits is treated as a maximum + * value and it will be decreased if a smaller table would be sufficient. + * @max_codeword_len + * The maximum allowed codeword length for this Huffman code. + * Must be <= DEFLATE_MAX_CODEWORD_LEN. + * @sorted_syms + * A temporary array of length @num_syms. + * @table_bits_ret + * If non-NULL, then the dynamic table_bits is enabled, and the actual + * table_bits value will be returned here. + * + * Returns %true if successful; %false if the codeword lengths do not form a + * valid Huffman code. + */ +static bool +build_decode_table(u32 decode_table[], + const u8 lens[], + const unsigned num_syms, + const u32 decode_results[], + unsigned table_bits, + unsigned max_codeword_len, + u16 *sorted_syms, + unsigned *table_bits_ret) +{ + unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned sym; /* current symbol */ + unsigned codeword; /* current codeword, bit-reversed */ + unsigned len; /* current codeword length in bits */ + unsigned count; /* num codewords remaining with this length */ + u32 codespace_used; /* codespace used out of '2^max_codeword_len' */ + unsigned cur_table_end; /* end index of current table */ + unsigned subtable_prefix; /* codeword prefix of current subtable */ + unsigned subtable_start; /* start index of current subtable */ + unsigned subtable_bits; /* log2 of current subtable length */ + + /* Count how many codewords have each length, including 0. */ + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + for (sym = 0; sym < num_syms; sym++) + len_counts[lens[sym]]++; + + /* + * Determine the actual maximum codeword length that was used, and + * decrease table_bits to it if allowed. + */ + while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0) + max_codeword_len--; + if (table_bits_ret != NULL) { + table_bits = MIN(table_bits, max_codeword_len); + *table_bits_ret = table_bits; + } + + /* + * Sort the symbols primarily by increasing codeword length and + * secondarily by increasing symbol value; or equivalently by their + * codewords in lexicographic order, since a canonical code is assumed. + * + * For efficiency, also compute 'codespace_used' in the same pass over + * 'len_counts[]' used to build 'offsets[]' for sorting. + */ + + /* Ensure that 'codespace_used' cannot overflow. */ + STATIC_ASSERT(sizeof(codespace_used) == 4); + STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >= + DEFLATE_MAX_NUM_SYMS); + + offsets[0] = 0; + offsets[1] = len_counts[0]; + codespace_used = 0; + for (len = 1; len < max_codeword_len; len++) { + offsets[len + 1] = offsets[len] + len_counts[len]; + codespace_used = (codespace_used << 1) + len_counts[len]; + } + codespace_used = (codespace_used << 1) + len_counts[len]; + + for (sym = 0; sym < num_syms; sym++) + sorted_syms[offsets[lens[sym]]++] = sym; + + sorted_syms += offsets[0]; /* Skip unused symbols */ + + /* lens[] is done being used, so we can write to decode_table[] now. 
 */
+
+	/*
+	 * Check whether the lengths form a complete code (exactly fills the
+	 * codespace), an incomplete code (doesn't fill the codespace), or an
+	 * overfull code (overflows the codespace). A codeword of length 'n'
+	 * uses proportion '1/(2^n)' of the codespace. An overfull code is
+	 * nonsensical, so is considered invalid. An incomplete code is
+	 * considered valid only in two specific cases; see below.
+	 */
+
+	/* overfull code? */
+	if (unlikely(codespace_used > (1U << max_codeword_len)))
+		return false;
+
+	/* incomplete code? */
+	if (unlikely(codespace_used < (1U << max_codeword_len))) {
+		u32 entry;
+		unsigned i;
+
+		/*
+		 * The DEFLATE RFC explicitly allows the offset code to be
+		 * incomplete in two cases: a code containing just 1 codeword,
+		 * if that codeword has length 1; and a code containing no
+		 * codewords. Note: the list of offset codeword lengths is
+		 * always nonempty, but lengths of 0 don't count as codewords.
+		 *
+		 * The RFC doesn't say whether the same cases are allowed for
+		 * the litlen and pre codes. It's actually impossible for no
+		 * symbols to be used from these codes; however, it's
+		 * technically possible for only one symbol to be used. zlib
+		 * allows 1 codeword for the litlen code, but not the precode.
+		 * The RFC also doesn't say whether, when there is 1 codeword,
+		 * that codeword is '0' or '1'. zlib uses '0'.
+		 *
+		 * We accept what zlib accepts, plus a bit more. First, we
+		 * don't treat the precode more strictly than the litlen and
+		 * offset codes. There's no convincing reason to add a special
+		 * case for the precode here.
+		 *
+		 * Second, we just map each allowed incomplete code to a complete
+		 * code with only real symbols. To do this, we choose a symbol,
+		 * either the used symbol (for codes with 1 codeword) or an
+		 * arbitrary symbol (for empty codes), and give it both
+		 * codewords '0' and '1'. zlib instead uses a special ERROR
+		 * symbol in the part of the codespace the code doesn't use.
+		 * However, having an ERROR symbol reduces the performance of
+		 * the Huffman decoder, for no real benefit. Our approach also
+		 * avoids having to decide whether '0' or '1' is correct.
+		 *
+		 * Like zlib, we still reject all incomplete codes that contain
+		 * more than 1 codeword or a codeword length greater than 1.
+		 */
+		if (codespace_used == 0) {
+			sym = 0; /* arbitrary */
+		} else {
+			if (codespace_used != (1U << (max_codeword_len - 1)) ||
+			    len_counts[1] != 1)
+				return false;
+			sym = sorted_syms[0];
+		}
+		entry = make_decode_table_entry(decode_results, sym, 1);
+		for (i = 0; i < (1U << table_bits); i++)
+			decode_table[i] = entry;
+		return true;
+	}
+
+	/*
+	 * The lengths form a complete code. Now, enumerate the codewords in
+	 * lexicographic order and fill the decode table entries for each one.
+	 *
+	 * First, process all codewords with len <= table_bits. Each one gets
+	 * '2^(table_bits-len)' direct entries in the table.
+	 *
+	 * Since DEFLATE uses bit-reversed codewords, these entries aren't
+	 * consecutive but rather are spaced '2^len' entries apart. This makes
+	 * filling them naively somewhat awkward and inefficient, since strided
+	 * stores are less cache-friendly and preclude the use of word or
+	 * vector-at-a-time stores to fill multiple entries per instruction.
+	 *
+	 * To optimize this, we incrementally double the table size. When
+	 * processing codewords with length 'len', the table is treated as
+	 * having only '2^len' entries, so each codeword uses just one entry.
+ * Then, each time 'len' is incremented, the table size is doubled and + * the first half is copied to the second half. This significantly + * improves performance over naively doing strided stores. + * + * Note that some entries copied for each table doubling may not have + * been initialized yet, but it doesn't matter since they're guaranteed + * to be initialized later (because the Huffman code is complete). + */ + codeword = 0; + len = 1; + while ((count = len_counts[len]) == 0) + len++; + cur_table_end = 1U << len; + while (len <= table_bits) { + /* Process all 'count' codewords with length 'len' bits. */ + do { + unsigned bit; + + /* Fill the first entry for the current codeword. */ + decode_table[codeword] = + make_decode_table_entry(decode_results, + *sorted_syms++, len); + + if (codeword == cur_table_end - 1) { + /* Last codeword (all 1's) */ + for (; len < table_bits; len++) { + __builtin_memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * + sizeof(decode_table[0])); + cur_table_end <<= 1; + } + return true; + } + /* + * To advance to the lexicographically next codeword in + * the canonical code, the codeword must be incremented, + * then 0's must be appended to the codeword as needed + * to match the next codeword's length. + * + * Since the codeword is bit-reversed, appending 0's is + * a no-op. However, incrementing it is nontrivial. To + * do so efficiently, use the 'bsr' instruction to find + * the last (highest order) 0 bit in the codeword, set + * it, and clear any later (higher order) 1 bits. But + * 'bsr' actually finds the highest order 1 bit, so to + * use it first flip all bits in the codeword by XOR'ing + * it with (1U << len) - 1 == cur_table_end - 1. + */ + bit = 1U << bsr32(codeword ^ (cur_table_end - 1)); + codeword &= bit - 1; + codeword |= bit; + } while (--count); + + /* Advance to the next codeword length. */ + do { + if (++len <= table_bits) { + __builtin_memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * sizeof(decode_table[0])); + cur_table_end <<= 1; + } + } while ((count = len_counts[len]) == 0); + } + + /* Process codewords with len > table_bits. These require subtables. */ + cur_table_end = 1U << table_bits; + subtable_prefix = -1; + subtable_start = 0; + for (;;) { + u32 entry; + unsigned i; + unsigned stride; + unsigned bit; + + /* + * Start a new subtable if the first 'table_bits' bits of the + * codeword don't match the prefix of the current subtable. + */ + if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) { + subtable_prefix = (codeword & ((1U << table_bits) - 1)); + subtable_start = cur_table_end; + /* + * Calculate the subtable length. If the codeword has + * length 'table_bits + n', then the subtable needs + * '2^n' entries. But it may need more; if fewer than + * '2^n' codewords of length 'table_bits + n' remain, + * then the length will need to be incremented to bring + * in longer codewords until the subtable can be + * completely filled. Note that because the Huffman + * code is complete, it will always be possible to fill + * the subtable eventually. + */ + subtable_bits = len - table_bits; + codespace_used = count; + while (codespace_used < (1U << subtable_bits)) { + subtable_bits++; + codespace_used = (codespace_used << 1) + + len_counts[table_bits + subtable_bits]; + } + cur_table_end = subtable_start + (1U << subtable_bits); + + /* + * Create the entry that points from the main table to + * the subtable. 
+ */ + decode_table[subtable_prefix] = + ((u32)subtable_start << 16) | + HUFFDEC_EXCEPTIONAL | + HUFFDEC_SUBTABLE_POINTER | + (subtable_bits << 8) | table_bits; + } + + /* Fill the subtable entries for the current codeword. */ + entry = make_decode_table_entry(decode_results, *sorted_syms++, + len - table_bits); + i = subtable_start + (codeword >> table_bits); + stride = 1U << (len - table_bits); + do { + decode_table[i] = entry; + i += stride; + } while (i < cur_table_end); + + /* Advance to the next codeword. */ + if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */ + return true; + bit = 1U << bsr32(codeword ^ ((1U << len) - 1)); + codeword &= bit - 1; + codeword |= bit; + count--; + while (count == 0) + count = len_counts[++len]; + } +} + +/* Build the decode table for the precode. */ +static bool +build_precode_decode_table(struct libdeflate_decompressor *d) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128); + + STATIC_ASSERT(ARRAY_LEN(precode_decode_results) == + DEFLATE_NUM_PRECODE_SYMS); + + return build_decode_table(d->u.l.precode_decode_table, + d->u.precode_lens, + DEFLATE_NUM_PRECODE_SYMS, + precode_decode_results, + PRECODE_TABLEBITS, + DEFLATE_MAX_PRE_CODEWORD_LEN, + d->sorted_syms, + NULL); +} + +/* Build the decode table for the literal/length code. */ +static bool +build_litlen_decode_table(struct libdeflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342); + + STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) == + DEFLATE_NUM_LITLEN_SYMS); + + return build_decode_table(d->u.litlen_decode_table, + d->u.l.lens, + num_litlen_syms, + litlen_decode_results, + LITLEN_TABLEBITS, + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + d->sorted_syms, + &d->litlen_tablebits); +} + +/* Build the decode table for the offset code. */ +static bool +build_offset_decode_table(struct libdeflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402); + + STATIC_ASSERT(ARRAY_LEN(offset_decode_results) == + DEFLATE_NUM_OFFSET_SYMS); + + return build_decode_table(d->offset_decode_table, + d->u.l.lens + num_litlen_syms, + num_offset_syms, + offset_decode_results, + OFFSET_TABLEBITS, + DEFLATE_MAX_OFFSET_CODEWORD_LEN, + d->sorted_syms, + NULL); +} + +/***************************************************************************** + * Main decompression routine + *****************************************************************************/ + +typedef enum libdeflate_result (*decompress_func_t) + (struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); + +#define FUNCNAME deflate_decompress_default +#undef ATTRIBUTES +#undef EXTRACT_VARBITS +#undef EXTRACT_VARBITS8 +#include "decompress_template.h" + +/* Include architecture-specific implementation(s) if available. 
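+ * (For this wasm build, neither ARCH_X86_32 nor ARCH_X86_64 should be
+ * defined, so no architecture-specific file is included and DEFAULT_IMPL
+ * falls back to deflate_decompress_default below.)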
*/ +#undef DEFAULT_IMPL +#undef arch_select_decompress_func +#if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# include "x86/decompress_impl.h" +#endif + +#ifndef DEFAULT_IMPL +# define DEFAULT_IMPL deflate_decompress_default +#endif + +#ifdef arch_select_decompress_func +static enum libdeflate_result +dispatch_decomp(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); + +static volatile decompress_func_t decompress_impl = dispatch_decomp; + +/* Choose the best implementation at runtime. */ +static enum libdeflate_result +dispatch_decomp(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) +{ + decompress_func_t f = arch_select_decompress_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + decompress_impl = f; + return f(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); +} +#else +/* The best implementation is statically known, so call it directly. */ +# define decompress_impl DEFAULT_IMPL +#endif + +/* + * This is the main DEFLATE decompression routine. See libdeflate.h for the + * documentation. + * + * Note that the real code is in decompress_template.h. The part here just + * handles calling the appropriate implementation depending on the CPU features + * at runtime. + */ +enum libdeflate_result +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) +{ + return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); +} + +enum libdeflate_result +libdeflate_deflate_decompress(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) +{ + return libdeflate_deflate_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); +} + +struct libdeflate_decompressor * +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options) +{ + struct libdeflate_decompressor *d; + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + d = __malloc(sizeof(*d)); + if (d == NULL) + return NULL; + /* + * Note that only certain parts of the decompressor actually must be + * initialized here: + * + * - 'static_codes_loaded' must be initialized to false. + * + * - The first half of the main portion of each decode table must be + * initialized to any value, to avoid reading from uninitialized + * memory during table expansion in build_decode_table(). (Although, + * this is really just to avoid warnings with dynamic tools like + * valgrind, since build_decode_table() is guaranteed to initialize + * all entries eventually anyway.) + * + * But for simplicity, we currently just zero the whole decompressor. 
+ */ + __builtin_memset(d, 0, sizeof(*d)); + return d; +} + +struct libdeflate_decompressor * +libdeflate_alloc_decompressor(void) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_decompressor_ex(&defaults); +} + +void +libdeflate_free_decompressor(struct libdeflate_decompressor *d) +{ + if (d) + __free(d); +} diff --git a/packages/wasm/lib/libdeflate/deflate_decompress.h b/packages/wasm/lib/libdeflate/deflate_decompress.h new file mode 100644 index 00000000..754fdba7 --- /dev/null +++ b/packages/wasm/lib/libdeflate/deflate_decompress.h @@ -0,0 +1,14 @@ +#ifndef LIB_DEFLATE_COMPRESS_H +#define LIB_DEFLATE_COMPRESS_H + +#include "lib_common.h" + +enum libdeflate_result +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + + +#endif /* LIB_DEFLATE_COMPRESS_H */ diff --git a/packages/wasm/lib/libdeflate/gzip_constants.h b/packages/wasm/lib/libdeflate/gzip_constants.h new file mode 100644 index 00000000..35e4728d --- /dev/null +++ b/packages/wasm/lib/libdeflate/gzip_constants.h @@ -0,0 +1,45 @@ +/* + * gzip_constants.h - constants for the gzip wrapper format + */ + +#ifndef LIB_GZIP_CONSTANTS_H +#define LIB_GZIP_CONSTANTS_H + +#define GZIP_MIN_HEADER_SIZE 10 +#define GZIP_FOOTER_SIZE 8 +#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE) + +#define GZIP_ID1 0x1F +#define GZIP_ID2 0x8B + +#define GZIP_CM_DEFLATE 8 + +#define GZIP_FTEXT 0x01 +#define GZIP_FHCRC 0x02 +#define GZIP_FEXTRA 0x04 +#define GZIP_FNAME 0x08 +#define GZIP_FCOMMENT 0x10 +#define GZIP_FRESERVED 0xE0 + +#define GZIP_MTIME_UNAVAILABLE 0 + +#define GZIP_XFL_SLOWEST_COMPRESSION 0x02 +#define GZIP_XFL_FASTEST_COMPRESSION 0x04 + +#define GZIP_OS_FAT 0 +#define GZIP_OS_AMIGA 1 +#define GZIP_OS_VMS 2 +#define GZIP_OS_UNIX 3 +#define GZIP_OS_VM_CMS 4 +#define GZIP_OS_ATARI_TOS 5 +#define GZIP_OS_HPFS 6 +#define GZIP_OS_MACINTOSH 7 +#define GZIP_OS_Z_SYSTEM 8 +#define GZIP_OS_CP_M 9 +#define GZIP_OS_TOPS_20 10 +#define GZIP_OS_NTFS 11 +#define GZIP_OS_QDOS 12 +#define GZIP_OS_RISCOS 13 +#define GZIP_OS_UNKNOWN 255 + +#endif /* LIB_GZIP_CONSTANTS_H */ diff --git a/packages/wasm/lib/libdeflate/gzip_decompress.c b/packages/wasm/lib/libdeflate/gzip_decompress.c new file mode 100644 index 00000000..3a4fabe6 --- /dev/null +++ b/packages/wasm/lib/libdeflate/gzip_decompress.c @@ -0,0 +1,160 @@ +/* + * gzip_decompress.c - decompress with a gzip wrapper + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "lib_common.h" +#include "gzip_constants.h" +#ifdef CRC32 +#include "crc32.h" +#endif + +enum libdeflate_result +libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) +{ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + u8 flg; + size_t actual_in_nbytes; + size_t actual_out_nbytes; + enum libdeflate_result result; + + if (in_nbytes < GZIP_MIN_OVERHEAD) + return LIBDEFLATE_BAD_DATA; + + /* ID1 */ + if (*in_next++ != GZIP_ID1) + return LIBDEFLATE_BAD_DATA; + /* ID2 */ + if (*in_next++ != GZIP_ID2) + return LIBDEFLATE_BAD_DATA; + /* CM */ + if (*in_next++ != GZIP_CM_DEFLATE) + return LIBDEFLATE_BAD_DATA; + flg = *in_next++; + /* MTIME */ + in_next += 4; + /* XFL */ + in_next += 1; + /* OS */ + in_next += 1; + + if (flg & GZIP_FRESERVED) + return LIBDEFLATE_BAD_DATA; + + /* Extra field */ + if (flg & GZIP_FEXTRA) { + u16 xlen = get_unaligned_le16(in_next); + in_next += 2; + + if (in_end - in_next < (u32)xlen + GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + + in_next += xlen; + } + + /* Original file name (zero terminated) */ + if (flg & GZIP_FNAME) { + while (*in_next++ != 0 && in_next != in_end) + ; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* File comment (zero terminated) */ + if (flg & GZIP_FCOMMENT) { + while (*in_next++ != 0 && in_next != in_end) + ; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* CRC16 for gzip header */ + if (flg & GZIP_FHCRC) { + in_next += 2; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* Compressed data */ + result = libdeflate_deflate_decompress_ex(d, in_next, + in_end - GZIP_FOOTER_SIZE - in_next, + out, out_nbytes_avail, + &actual_in_nbytes, + actual_out_nbytes_ret); + if (result != LIBDEFLATE_SUCCESS) + return result; + + if (actual_out_nbytes_ret) + actual_out_nbytes = *actual_out_nbytes_ret; + else + actual_out_nbytes = out_nbytes_avail; + + in_next += actual_in_nbytes; + + /* CRC32 */ + #ifdef CRC32 + // this library is supposed to be used for MTProto + // there's no need to check for CRC32, since the data is guaranteed to be correct + // by the protocol itself. 
not including crc32 implementation allows us to + // save around 8kb of code size + if (libdeflate_crc32(0, out, actual_out_nbytes) != + get_unaligned_le32(in_next)) + return LIBDEFLATE_BAD_DATA; + #endif + in_next += 4; + + /* ISIZE */ + if ((u32)actual_out_nbytes != get_unaligned_le32(in_next)) + return LIBDEFLATE_BAD_DATA; + in_next += 4; + + if (actual_in_nbytes_ret) + *actual_in_nbytes_ret = in_next - (u8 *)in; + + return LIBDEFLATE_SUCCESS; +} + +LIBDEFLATEAPI int32_t +libdeflate_gzip_get_output_size(const void* in, size_t in_nbytes) { + return get_unaligned_le32((u8*)in + in_nbytes - 4); +} + +LIBDEFLATEAPI enum libdeflate_result +libdeflate_gzip_decompress(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) +{ + // we're using `libdeflate_zlib_get_output_size` to allocate exactly the + // right amount of memory for the output buffer, so this is redundant + size_t actual_out_nbytes_ret; + return libdeflate_gzip_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, &actual_out_nbytes_ret); +} diff --git a/packages/wasm/lib/libdeflate/hc_matchfinder.h b/packages/wasm/lib/libdeflate/hc_matchfinder.h new file mode 100644 index 00000000..edf4e277 --- /dev/null +++ b/packages/wasm/lib/libdeflate/hc_matchfinder.h @@ -0,0 +1,401 @@ +/* + * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * --------------------------------------------------------------------------- + * + * Algorithm + * + * This is a Hash Chains (hc) based matchfinder. + * + * The main data structure is a hash table where each hash bucket contains a + * linked list (or "chain") of sequences whose first 4 bytes share the same hash + * code. Each sequence is identified by its starting position in the input + * buffer. + * + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 4 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, this hash + * bucket's linked list is searched for matches. Then, a new linked list node + * is created to represent the current sequence and is prepended to the list. 
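+ *
+ * For instance (hypothetical positions): if the sequences starting at
+ * positions 100, 250 and 400 all hash into the same bucket, then after they
+ * have been inserted the bucket head is 400 and the links are
+ * next_tab[400] == 250 and next_tab[250] == 100, so a later search through
+ * that bucket visits position 400, then 250, then 100.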
+ * + * This algorithm has several useful properties: + * + * - It only finds true Lempel-Ziv matches; i.e., those where the matching + * sequence occurs prior to the sequence being matched against. + * + * - The sequences in each linked list are always sorted by decreasing starting + * position. Therefore, the closest (smallest offset) matches are found + * first, which in many compression formats tend to be the cheapest to encode. + * + * - Although fast running time is not guaranteed due to the possibility of the + * lists getting very long, the worst degenerate behavior can be easily + * prevented by capping the number of nodes searched at each position. + * + * - If the compressor decides not to search for matches at a certain position, + * then that position can be quickly inserted without searching the list. + * + * - The algorithm is adaptable to sliding windows: just store the positions + * relative to a "base" value that is updated from time to time, and stop + * searching each list when the sequences get too far away. + * + * ---------------------------------------------------------------------------- + * + * Optimizations + * + * The main hash table and chains handle length 4+ matches. Length 3 matches + * are handled by a separate hash table with no chains. This works well for + * typical "greedy" or "lazy"-style compressors, where length 3 matches are + * often only helpful if they have small offsets. Instead of searching a full + * chain for length 3+ matches, the algorithm just checks for one close length 3 + * match, then focuses on finding length 4+ matches. + * + * The longest_match() and skip_bytes() functions are inlined into the + * compressors that use them. This isn't just about saving the overhead of a + * function call. These functions are intended to be called from the inner + * loops of compressors, where giving the compiler more control over register + * allocation is very helpful. There is also significant benefit to be gained + * from allowing the CPU to predict branches independently at each call site. + * For example, "lazy"-style compressors can be written with two calls to + * longest_match(), each of which starts with a different 'best_len' and + * therefore has significantly different performance characteristics. + * + * Although any hash function can be used, a multiplicative hash is fast and + * works well. + * + * On some processors, it is significantly faster to extend matches by whole + * words (32 or 64 bits) instead of by individual bytes. For this to be the + * case, the processor must implement unaligned memory accesses efficiently and + * must have either a fast "find first set bit" instruction or a fast "find last + * set bit" instruction, depending on the processor's endianness. + * + * The code uses one loop for finding the first match and one loop for finding a + * longer match. Each of these loops is tuned for its respective task and in + * combination are faster than a single generalized loop that handles both + * tasks. + * + * The code also uses a tight inner loop that only compares the last and first + * bytes of a potential match. It is only when these bytes match that a full + * match extension is attempted. 
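+ *
+ * (Sketch of the word-at-a-time extension idea on a little-endian machine
+ * with fast unaligned loads: XOR an 8-byte load from each side and, if the
+ * result is nonzero, count its trailing zero bits and divide by 8 to find
+ * the first mismatching byte; the actual lz_extend() helper may differ in
+ * detail.)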
+ * + * ---------------------------------------------------------------------------- + */ + +#ifndef LIB_HC_MATCHFINDER_H +#define LIB_HC_MATCHFINDER_H + +#include "matchfinder_common.h" + +#define HC_MATCHFINDER_HASH3_ORDER 15 +#define HC_MATCHFINDER_HASH4_ORDER 16 + +#define HC_MATCHFINDER_TOTAL_HASH_SIZE \ + (((1UL << HC_MATCHFINDER_HASH3_ORDER) + \ + (1UL << HC_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t)) + +struct MATCHFINDER_ALIGNED hc_matchfinder { + + /* The hash table for finding length 3 matches */ + mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER]; + + /* The hash table which contains the first nodes of the linked lists for + * finding length 4+ matches */ + mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER]; + + /* The "next node" references for the linked lists. The "next node" of + * the node for the sequence with position 'pos' is 'next_tab[pos]'. */ + mf_pos_t next_tab[MATCHFINDER_WINDOW_SIZE]; +}; + +/* Prepare the matchfinder for a new input buffer. */ +static void +hc_matchfinder_init(struct hc_matchfinder *mf) +{ + STATIC_ASSERT(HC_MATCHFINDER_TOTAL_HASH_SIZE % + MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_init((mf_pos_t *)mf, HC_MATCHFINDER_TOTAL_HASH_SIZE); +} + +static void +hc_matchfinder_slide_window(struct hc_matchfinder *mf) +{ + STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf)); +} + +/* + * Find the longest match longer than 'best_len' bytes. + * + * @mf + * The matchfinder structure. + * @in_base_p + * Location of a pointer which points to the place in the input data the + * matchfinder currently stores positions relative to. This may be updated + * by this function. + * @in_next + * Pointer to the next position in the input buffer, i.e. the sequence + * being matched against. + * @best_len + * Require a match longer than this length. + * @max_len + * The maximum permissible match length at this position. + * @nice_len + * Stop searching if a match of at least this length is found. + * Must be <= @max_len. + * @max_search_depth + * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + 1. + * @offset_ret + * If a match is found, its offset is returned in this location. + * + * Return the length of the match found, or 'best_len' if no match longer than + * 'best_len' was found. + */ +static u32 +hc_matchfinder_longest_match(struct hc_matchfinder * const mf, + const u8 ** const in_base_p, + const u8 * const in_next, + u32 best_len, + const u32 max_len, + const u32 nice_len, + const u32 max_search_depth, + u32 * const next_hashes, + u32 * const offset_ret) +{ + u32 depth_remaining = max_search_depth; + const u8 *best_matchptr = in_next; + mf_pos_t cur_node3, cur_node4; + u32 hash3, hash4; + u32 next_hashseq; + u32 seq4; + const u8 *matchptr; + u32 len; + u32 cur_pos = in_next - *in_base_p; + const u8 *in_base; + mf_pos_t cutoff; + + if (cur_pos == MATCHFINDER_WINDOW_SIZE) { + hc_matchfinder_slide_window(mf); + *in_base_p += MATCHFINDER_WINDOW_SIZE; + cur_pos = 0; + } + + in_base = *in_base_p; + cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE; + + if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */ + goto out; + + /* Get the precomputed hash codes. 
*/ + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + + /* From the hash buckets, get the first node of each linked list. */ + cur_node3 = mf->hash3_tab[hash3]; + cur_node4 = mf->hash4_tab[hash4]; + + /* Update for length 3 matches. This replaces the singleton node in the + * 'hash3' bucket with the node for the current sequence. */ + mf->hash3_tab[hash3] = cur_pos; + + /* Update for length 4 matches. This prepends the node for the current + * sequence to the linked list in the 'hash4' bucket. */ + mf->hash4_tab[hash4] = cur_pos; + mf->next_tab[cur_pos] = cur_node4; + + /* Compute the next hash codes. */ + next_hashseq = get_unaligned_le32(in_next + 1); + next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); + next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); + prefetchw(&mf->hash3_tab[next_hashes[0]]); + prefetchw(&mf->hash4_tab[next_hashes[1]]); + + if (best_len < 4) { /* No match of length >= 4 found yet? */ + + /* Check for a length 3 match if needed. */ + + if (cur_node3 <= cutoff) + goto out; + + seq4 = load_u32_unaligned(in_next); + + if (best_len < 3) { + matchptr = &in_base[cur_node3]; + if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) { + best_len = 3; + best_matchptr = matchptr; + } + } + + /* Check for a length 4 match. */ + + if (cur_node4 <= cutoff) + goto out; + + for (;;) { + /* No length 4 match found yet. Check the first 4 bytes. */ + matchptr = &in_base[cur_node4]; + + if (load_u32_unaligned(matchptr) == seq4) + break; + + /* The first 4 bytes did not match. Keep trying. */ + cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; + if (cur_node4 <= cutoff || !--depth_remaining) + goto out; + } + + /* Found a match of length >= 4. Extend it to its full length. */ + best_matchptr = matchptr; + best_len = lz_extend(in_next, best_matchptr, 4, max_len); + if (best_len >= nice_len) + goto out; + cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; + if (cur_node4 <= cutoff || !--depth_remaining) + goto out; + } else { + if (cur_node4 <= cutoff || best_len >= nice_len) + goto out; + } + + /* Check for matches of length >= 5. */ + + for (;;) { + for (;;) { + matchptr = &in_base[cur_node4]; + + /* Already found a length 4 match. Try for a longer + * match; start by checking either the last 4 bytes and + * the first 4 bytes, or the last byte. (The last byte, + * the one which would extend the match length by 1, is + * the most important.) */ + #if UNALIGNED_ACCESS_IS_FAST + if ((load_u32_unaligned(matchptr + best_len - 3) == + load_u32_unaligned(in_next + best_len - 3)) && + (load_u32_unaligned(matchptr) == + load_u32_unaligned(in_next))) + #else + if (matchptr[best_len] == in_next[best_len]) + #endif + break; + + /* Continue to the next node in the list. */ + cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; + if (cur_node4 <= cutoff || !--depth_remaining) + goto out; + } + + #if UNALIGNED_ACCESS_IS_FAST + len = 4; + #else + len = 0; + #endif + len = lz_extend(in_next, matchptr, len, max_len); + if (len > best_len) { + /* This is the new longest match. */ + best_len = len; + best_matchptr = matchptr; + if (best_len >= nice_len) + goto out; + } + + /* Continue to the next node in the list. */ + cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; + if (cur_node4 <= cutoff || !--depth_remaining) + goto out; + } +out: + *offset_ret = in_next - best_matchptr; + return best_len; +} + +/* + * Advance the matchfinder, but don't search for matches. 
+ * + * @mf + * The matchfinder structure. + * @in_base_p + * Location of a pointer which points to the place in the input data the + * matchfinder currently stores positions relative to. This may be updated + * by this function. + * @in_next + * Pointer to the next position in the input buffer. + * @in_end + * Pointer to the end of the input buffer. + * @count + * The number of bytes to advance. Must be > 0. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + @count. + */ +static void +hc_matchfinder_skip_bytes(struct hc_matchfinder * const mf, + const u8 ** const in_base_p, + const u8 *in_next, + const u8 * const in_end, + const u32 count, + u32 * const next_hashes) +{ + u32 cur_pos; + u32 hash3, hash4; + u32 next_hashseq; + u32 remaining = count; + + if (unlikely(count + 5 > in_end - in_next)) + return; + + cur_pos = in_next - *in_base_p; + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + do { + if (cur_pos == MATCHFINDER_WINDOW_SIZE) { + hc_matchfinder_slide_window(mf); + *in_base_p += MATCHFINDER_WINDOW_SIZE; + cur_pos = 0; + } + mf->hash3_tab[hash3] = cur_pos; + mf->next_tab[cur_pos] = mf->hash4_tab[hash4]; + mf->hash4_tab[hash4] = cur_pos; + + next_hashseq = get_unaligned_le32(++in_next); + hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); + hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); + cur_pos++; + } while (--remaining); + + prefetchw(&mf->hash3_tab[hash3]); + prefetchw(&mf->hash4_tab[hash4]); + next_hashes[0] = hash3; + next_hashes[1] = hash4; +} + +#endif /* LIB_HC_MATCHFINDER_H */ diff --git a/packages/wasm/lib/libdeflate/ht_matchfinder.h b/packages/wasm/lib/libdeflate/ht_matchfinder.h new file mode 100644 index 00000000..6437492f --- /dev/null +++ b/packages/wasm/lib/libdeflate/ht_matchfinder.h @@ -0,0 +1,234 @@ +/* + * ht_matchfinder.h - Lempel-Ziv matchfinding with a hash table + * + * Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * --------------------------------------------------------------------------- + * + * This is a Hash Table (ht) matchfinder. + * + * This is a variant of the Hash Chains (hc) matchfinder that is optimized for + * very fast compression. The ht_matchfinder stores the hash chains inline in + * the hash table, whereas the hc_matchfinder stores them in a separate array. 
+ * Storing the hash chains inline is the faster method when max_search_depth + * (the maximum chain length) is very small. It is not appropriate when + * max_search_depth is larger, as then it uses too much memory. + * + * Due to its focus on speed, the ht_matchfinder doesn't support length 3 + * matches. It also doesn't allow max_search_depth to vary at runtime; it is + * fixed at build time as HT_MATCHFINDER_BUCKET_SIZE. + * + * See hc_matchfinder.h for more information. + */ + +#ifndef LIB_HT_MATCHFINDER_H +#define LIB_HT_MATCHFINDER_H + +#include "matchfinder_common.h" + +#define HT_MATCHFINDER_HASH_ORDER 15 +#define HT_MATCHFINDER_BUCKET_SIZE 2 + +#define HT_MATCHFINDER_MIN_MATCH_LEN 4 +/* Minimum value of max_len for ht_matchfinder_longest_match() */ +#define HT_MATCHFINDER_REQUIRED_NBYTES 5 + +struct MATCHFINDER_ALIGNED ht_matchfinder { + mf_pos_t hash_tab[1UL << HT_MATCHFINDER_HASH_ORDER] + [HT_MATCHFINDER_BUCKET_SIZE]; +}; + +static void +ht_matchfinder_init(struct ht_matchfinder *mf) +{ + STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_init((mf_pos_t *)mf, sizeof(*mf)); +} + +static void +ht_matchfinder_slide_window(struct ht_matchfinder *mf) +{ + matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf)); +} + +/* Note: max_len must be >= HT_MATCHFINDER_REQUIRED_NBYTES */ +static u32 +ht_matchfinder_longest_match(struct ht_matchfinder * const mf, + const u8 ** const in_base_p, + const u8 * const in_next, + const u32 max_len, + const u32 nice_len, + u32 * const next_hash, + u32 * const offset_ret) +{ + u32 best_len = 0; + const u8 *best_matchptr = in_next; + u32 cur_pos = in_next - *in_base_p; + const u8 *in_base; + mf_pos_t cutoff; + u32 hash; + u32 seq; + mf_pos_t cur_node; + const u8 *matchptr; +#if HT_MATCHFINDER_BUCKET_SIZE > 1 + mf_pos_t to_insert; + u32 len; +#endif +#if HT_MATCHFINDER_BUCKET_SIZE > 2 + int i; +#endif + + /* This is assumed throughout this function. */ + STATIC_ASSERT(HT_MATCHFINDER_MIN_MATCH_LEN == 4); + + if (cur_pos == MATCHFINDER_WINDOW_SIZE) { + ht_matchfinder_slide_window(mf); + *in_base_p += MATCHFINDER_WINDOW_SIZE; + cur_pos = 0; + } + in_base = *in_base_p; + cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE; + + hash = *next_hash; + STATIC_ASSERT(HT_MATCHFINDER_REQUIRED_NBYTES == 5); + *next_hash = lz_hash(get_unaligned_le32(in_next + 1), + HT_MATCHFINDER_HASH_ORDER); + seq = load_u32_unaligned(in_next); + prefetchw(&mf->hash_tab[*next_hash]); +#if HT_MATCHFINDER_BUCKET_SIZE == 1 + /* Hand-unrolled version for BUCKET_SIZE == 1 */ + cur_node = mf->hash_tab[hash][0]; + mf->hash_tab[hash][0] = cur_pos; + if (cur_node <= cutoff) + goto out; + matchptr = &in_base[cur_node]; + if (load_u32_unaligned(matchptr) == seq) { + best_len = lz_extend(in_next, matchptr, 4, max_len); + best_matchptr = matchptr; + } +#elif HT_MATCHFINDER_BUCKET_SIZE == 2 + /* + * Hand-unrolled version for BUCKET_SIZE == 2. The logic here also + * differs slightly in that it copies the first entry to the second even + * if nice_len is reached on the first, as this can be slightly faster. 
+ */ + cur_node = mf->hash_tab[hash][0]; + mf->hash_tab[hash][0] = cur_pos; + if (cur_node <= cutoff) + goto out; + matchptr = &in_base[cur_node]; + + to_insert = cur_node; + cur_node = mf->hash_tab[hash][1]; + mf->hash_tab[hash][1] = to_insert; + + if (load_u32_unaligned(matchptr) == seq) { + best_len = lz_extend(in_next, matchptr, 4, max_len); + best_matchptr = matchptr; + if (cur_node <= cutoff || best_len >= nice_len) + goto out; + matchptr = &in_base[cur_node]; + if (load_u32_unaligned(matchptr) == seq && + load_u32_unaligned(matchptr + best_len - 3) == + load_u32_unaligned(in_next + best_len - 3)) { + len = lz_extend(in_next, matchptr, 4, max_len); + if (len > best_len) { + best_len = len; + best_matchptr = matchptr; + } + } + } else { + if (cur_node <= cutoff) + goto out; + matchptr = &in_base[cur_node]; + if (load_u32_unaligned(matchptr) == seq) { + best_len = lz_extend(in_next, matchptr, 4, max_len); + best_matchptr = matchptr; + } + } +#else + /* Generic version for HT_MATCHFINDER_BUCKET_SIZE > 2 */ + to_insert = cur_pos; + for (i = 0; i < HT_MATCHFINDER_BUCKET_SIZE; i++) { + cur_node = mf->hash_tab[hash][i]; + mf->hash_tab[hash][i] = to_insert; + if (cur_node <= cutoff) + goto out; + matchptr = &in_base[cur_node]; + if (load_u32_unaligned(matchptr) == seq) { + len = lz_extend(in_next, matchptr, 4, max_len); + if (len > best_len) { + best_len = len; + best_matchptr = matchptr; + if (best_len >= nice_len) + goto out; + } + } + to_insert = cur_node; + } +#endif +out: + *offset_ret = in_next - best_matchptr; + return best_len; +} + +static void +ht_matchfinder_skip_bytes(struct ht_matchfinder * const mf, + const u8 ** const in_base_p, + const u8 *in_next, + const u8 * const in_end, + const u32 count, + u32 * const next_hash) +{ + s32 cur_pos = in_next - *in_base_p; + u32 hash; + u32 remaining = count; + int i; + + if (unlikely(count + HT_MATCHFINDER_REQUIRED_NBYTES > in_end - in_next)) + return; + + if (cur_pos + count - 1 >= MATCHFINDER_WINDOW_SIZE) { + ht_matchfinder_slide_window(mf); + *in_base_p += MATCHFINDER_WINDOW_SIZE; + cur_pos -= MATCHFINDER_WINDOW_SIZE; + } + + hash = *next_hash; + do { + for (i = HT_MATCHFINDER_BUCKET_SIZE - 1; i > 0; i--) + mf->hash_tab[hash][i] = mf->hash_tab[hash][i - 1]; + mf->hash_tab[hash][0] = cur_pos; + + hash = lz_hash(get_unaligned_le32(++in_next), + HT_MATCHFINDER_HASH_ORDER); + cur_pos++; + } while (--remaining); + + prefetchw(&mf->hash_tab[hash]); + *next_hash = hash; +} + +#endif /* LIB_HT_MATCHFINDER_H */ diff --git a/packages/wasm/lib/libdeflate/matchfinder_common.h b/packages/wasm/lib/libdeflate/matchfinder_common.h new file mode 100644 index 00000000..07c44673 --- /dev/null +++ b/packages/wasm/lib/libdeflate/matchfinder_common.h @@ -0,0 +1,194 @@ +/* + * matchfinder_common.h - common code for Lempel-Ziv matchfinding + */ + +#ifndef LIB_MATCHFINDER_COMMON_H +#define LIB_MATCHFINDER_COMMON_H + +#include "lib_common.h" + +#ifndef MATCHFINDER_WINDOW_ORDER +# error "MATCHFINDER_WINDOW_ORDER must be defined!" +#endif + +/* + * Given a 32-bit value that was loaded with the platform's native endianness, + * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24 + * bits contain the first 3 bytes, arranged in octets in a platform-dependent + * order, at the memory location from which the input 32-bit value was loaded. + */ +static u32 +loaded_u32_to_u24(u32 v) +{ + if (CPU_IS_LITTLE_ENDIAN()) + return v & 0xFFFFFF; + else + return v >> 8; +} + +/* + * Load the next 3 bytes from @p into the 24 low-order bits of a 32-bit value. 
+ * The order in which the 3 bytes will be arranged as octets in the 24 bits is + * platform-dependent. At least 4 bytes (not 3) must be available at @p. + */ +static u32 +load_u24_unaligned(const u8 *p) +{ +#if UNALIGNED_ACCESS_IS_FAST + return loaded_u32_to_u24(load_u32_unaligned(p)); +#else + if (CPU_IS_LITTLE_ENDIAN()) + return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); + else + return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16); +#endif +} + +#define MATCHFINDER_WINDOW_SIZE (1UL << MATCHFINDER_WINDOW_ORDER) + +typedef s16 mf_pos_t; + +#define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE) + +/* + * Required alignment of the matchfinder buffer pointer and size. The values + * here come from the AVX-2 implementation, which is the worst case. + */ +#define MATCHFINDER_MEM_ALIGNMENT 32 +#define MATCHFINDER_SIZE_ALIGNMENT 128 + +#undef matchfinder_init +#undef matchfinder_rebase +#ifdef _aligned_attribute +# define MATCHFINDER_ALIGNED _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT) +#else +# define MATCHFINDER_ALIGNED +#endif + +/* + * Initialize the hash table portion of the matchfinder. + * + * Essentially, this is an optimized memset(). + * + * 'data' must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and + * 'size' must be a multiple of MATCHFINDER_SIZE_ALIGNMENT. + */ +#ifndef matchfinder_init +static void +matchfinder_init(mf_pos_t *data, size_t size) +{ + size_t num_entries = size / sizeof(*data); + size_t i; + + for (i = 0; i < num_entries; i++) + data[i] = MATCHFINDER_INITVAL; +} +#endif + +/* + * Slide the matchfinder by MATCHFINDER_WINDOW_SIZE bytes. + * + * This must be called just after each MATCHFINDER_WINDOW_SIZE bytes have been + * run through the matchfinder. + * + * This subtracts MATCHFINDER_WINDOW_SIZE bytes from each entry in the given + * array, making the entries be relative to the current position rather than the + * position MATCHFINDER_WINDOW_SIZE bytes prior. To avoid integer underflows, + * entries that would become less than -MATCHFINDER_WINDOW_SIZE stay at + * -MATCHFINDER_WINDOW_SIZE, keeping them permanently out of bounds. + * + * The given array must contain all matchfinder data that is position-relative: + * the hash table(s) as well as any hash chain or binary tree links. Its + * address must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and its size + * must be a multiple of MATCHFINDER_SIZE_ALIGNMENT. + */ +#ifndef matchfinder_rebase +static void +matchfinder_rebase(mf_pos_t *data, size_t size) +{ + size_t num_entries = size / sizeof(*data); + size_t i; + + if (MATCHFINDER_WINDOW_SIZE == 32768) { + /* + * Branchless version for 32768-byte windows. Clear all bits if + * the value was already negative, then set the sign bit. This + * is equivalent to subtracting 32768 with signed saturation. + */ + for (i = 0; i < num_entries; i++) + data[i] = 0x8000 | (data[i] & ~(data[i] >> 15)); + } else { + for (i = 0; i < num_entries; i++) { + if (data[i] >= 0) + data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; + else + data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; + } + } +} +#endif + +/* + * The hash function: given a sequence prefix held in the low-order bits of a + * 32-bit value, multiply by a carefully-chosen large constant. Discard any + * bits of the product that don't fit in a 32-bit value, but take the + * next-highest @num_bits bits of the product as the hash value, as those have + * the most randomness. 
+ */ +static u32 +lz_hash(u32 seq, unsigned num_bits) +{ + return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits); +} + +/* + * Return the number of bytes at @matchptr that match the bytes at @strptr, up + * to a maximum of @max_len. Initially, @start_len bytes are matched. + */ +static unsigned +lz_extend(const u8 * const strptr, const u8 * const matchptr, + const unsigned start_len, const unsigned max_len) +{ + unsigned len = start_len; + machine_word_t v_word; + + if (UNALIGNED_ACCESS_IS_FAST) { + + if (likely(max_len - len >= 4 * WORDBYTES)) { + + #define COMPARE_WORD_STEP \ + v_word = load_word_unaligned(&matchptr[len]) ^ \ + load_word_unaligned(&strptr[len]); \ + if (v_word != 0) \ + goto word_differs; \ + len += WORDBYTES; \ + + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + #undef COMPARE_WORD_STEP + } + + while (len + WORDBYTES <= max_len) { + v_word = load_word_unaligned(&matchptr[len]) ^ + load_word_unaligned(&strptr[len]); + if (v_word != 0) + goto word_differs; + len += WORDBYTES; + } + } + + while (len < max_len && matchptr[len] == strptr[len]) + len++; + return len; + +word_differs: + if (CPU_IS_LITTLE_ENDIAN()) + len += (bsfw(v_word) >> 3); + else + len += (WORDBITS - 1 - bsrw(v_word)) >> 3; + return len; +} + +#endif /* LIB_MATCHFINDER_COMMON_H */ diff --git a/packages/wasm/lib/libdeflate/zlib_compress.c b/packages/wasm/lib/libdeflate/zlib_compress.c new file mode 100644 index 00000000..b486c33e --- /dev/null +++ b/packages/wasm/lib/libdeflate/zlib_compress.c @@ -0,0 +1,83 @@ +/* + * zlib_compress.c - compress with a zlib wrapper + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "deflate_compress.h" +#include "zlib_constants.h" +#include "adler32.h" + +LIBDEFLATEAPI size_t +libdeflate_zlib_compress(struct libdeflate_compressor *c, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) +{ + u8 *out_next = out; + u16 hdr; + unsigned compression_level; + unsigned level_hint; + size_t deflate_size; + + if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD) + return 0; + + /* 2 byte header: CMF and FLG */ + hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12); + compression_level = libdeflate_get_compression_level(c); + if (compression_level < 2) + level_hint = ZLIB_FASTEST_COMPRESSION; + else if (compression_level < 6) + level_hint = ZLIB_FAST_COMPRESSION; + else if (compression_level < 8) + level_hint = ZLIB_DEFAULT_COMPRESSION; + else + level_hint = ZLIB_SLOWEST_COMPRESSION; + hdr |= level_hint << 6; + hdr |= 31 - (hdr % 31); + + put_unaligned_be16(hdr, out_next); + out_next += 2; + + /* Compressed data */ + deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, + out_nbytes_avail - ZLIB_MIN_OVERHEAD); + if (deflate_size == 0) + return 0; + out_next += deflate_size; + + /* ADLER32 */ + put_unaligned_be32(libdeflate_adler32(1, in, in_nbytes), out_next); + out_next += 4; + + return out_next - (u8 *)out; +} + +/*LIBDEFLATEAPI*/ static size_t +libdeflate_zlib_compress_bound(struct libdeflate_compressor *c, + size_t in_nbytes) +{ + return ZLIB_MIN_OVERHEAD + + libdeflate_deflate_compress_bound(c, in_nbytes); +} diff --git a/packages/wasm/lib/libdeflate/zlib_constants.h b/packages/wasm/lib/libdeflate/zlib_constants.h new file mode 100644 index 00000000..f304310c --- /dev/null +++ b/packages/wasm/lib/libdeflate/zlib_constants.h @@ -0,0 +1,21 @@ +/* + * zlib_constants.h - constants for the zlib wrapper format + */ + +#ifndef LIB_ZLIB_CONSTANTS_H +#define LIB_ZLIB_CONSTANTS_H + +#define ZLIB_MIN_HEADER_SIZE 2 +#define ZLIB_FOOTER_SIZE 4 +#define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE) + +#define ZLIB_CM_DEFLATE 8 + +#define ZLIB_CINFO_32K_WINDOW 7 + +#define ZLIB_FASTEST_COMPRESSION 0 +#define ZLIB_FAST_COMPRESSION 1 +#define ZLIB_DEFAULT_COMPRESSION 2 +#define ZLIB_SLOWEST_COMPRESSION 3 + +#endif /* LIB_ZLIB_CONSTANTS_H */ diff --git a/packages/wasm/lib/mtcute.wasm b/packages/wasm/lib/mtcute.wasm new file mode 100755 index 0000000000000000000000000000000000000000..9d13ed6ad354f9e7e946c79ae8d3fc0e7b080a6d GIT binary patch literal 45120 zcmd6w37l2)8~@KacVA}a&NQvla_((0REqYEIceV!lC-aP+Gc6A&$Osi2vGwkJZbMHO(obU2{pU?9=-_Q4ZZrX@~NxG(K zdO_N7y;vLWF6PjN8}i4c-g@K=pY&qASkCj;t4?t)V87unXVtB|+Bfhq-x$^kW+5A zSJT}deX8#DdR)3@c>OfcD*4hK@;zasc_)pTG$ntAq4|dmn>1qL#3`dR%{y$^*!(eL zG(BtLgps4ijGZ{5aLh3E-LO$pCQZ#BQ&2D^Uu#;c%(rrV*)LD7Q|3$i*3o0i+}yl& zneWxj<-R_pUYW1Q&73fG*tju;!=_9toI0&=SiyvuW3*Gt)~>E7EkJ8gqs;9yd7o^W z)-rX%xG|@;YCUYsBW>k*Ub zo1(dLxY3BaRmE0P=YH_%MK$pv94 zB@%U;eoJd@_^E$$vv@4UDk-;{%uJ)0Ci`Wu zJ*Xz;lafnHjBXU25@}+3OgSxQ;xV77UyD+VU&d9wh!P@ia}7fDM~ zOs{1W#$(}=buAi^3$$pu88(d$7@`*S(20m?Y@yh>1+k1i)a8yEVLfDWgF%^LGtxmf z%yiRdMleclm-a}1+-5LxQq*I5&5S-NI+dsP=f!LKXdaiD=?p7peR0c`E1%t_$9XGX zxJ{p3LQL9CDLrG}EK2jC6yB&NNXexQqFv=nmm-}) zX-53qz&HdfkBqepP0W)kV;7(aO7&i@M)7fLUWt*5QOuEVMg3k=zEbL!@(p>HD|LEg zfaPumBCG}i6G*6?yPD~mXwa)vE_M*7BOWwTI#4opb}0eXJj?{ed#Q&3&Nkf9fOIu3 z!$PNGmV!}x0oIYn0i;GNZoWb)goQhg?=j%Z=i*? 
zy7aK%%gl|r!525SHfk&6v^Hv}R|PhP$*a)R+Q>BBowYWu1nh#5AQ;)aH#;5;^JZmG za3z3?#Oas9na|Xt23m1TpJ-D=E+~U4uZ=35(bBCw>oel=CfC}xBco?lEJeBk-oue_ zEEV+tUa!DUOeHtwalq%cYQ#;CjEtL-V}3R4yohc2duNF%4Ti-K*B0cO+HnjG1!YSo zv+0x6zakpX6w`2QJ*7V(~m&>Wie=gJX}PAzjCCOkbH%G@@lk(QTPBimB2t zDZyoFkx1E(Iv=600JoCx+)SLt;-pvwD9DiZIZ` zBAgMf+1mc&ZSHt1WQN4Q=qe10+l+>!vWV%yY$BOrTAadZ@ej{r6yd;mE8mKH5pxv3 zB3HRZF%<5ytCU8{yQ(I73fh-Yfs9ub)JMz_z4>ju`5nCl&7d=aA%-wmR&T+i-jXN_ z5-fu~70Q&xERS?EsVHfIK(bU-1XL|juT}_^(R^{I1_Pq{A|F&=g3`idd!0DyV-&0K zMXU-=QWWGz!KNF)W=Qxk%uHSwJ3ktzCJHTaD14aV1QX7LW%3@HZCk!lOCuT=0+Uy@{qVx{v0{%q1HinR6xo%5RY?(f(M zid*iNOe7WGLR5<@P)R}|ort=*hYr|HkvgRvd=>7NXbN?6NjiNDnCYRYVTf^tyP*dO z4ZL`pYH0!|Wq%=VQx2vhSs}&h%3^V_m|>~88lpGx{Cp2^Bbgyo;YxlVx=j*XbpwG? zJU>vwQTXEwxGHcVvC|2#HUNB92|gs(s#6s6T6!T`5l>iTtq96JT&!*@2_&%V;Ko(j zkVq8?%_==4yXc@xnnBVd@bTc~Mb&mjMN6s|=JPpps_LV)vSMST#WZ_=FdnHXIzX*1 zDJIe|6l5^LNjOqF)Lb`;8A-mE@pHy9+ZjEQUo!8+sN!Lb0imTjcMy5WBVw)w=%2AI zF&JCRh5pfx8xRq8sE`jPiL6ffE>jZ?SbBR+>IUvIDgZ*qV6ZBHrLNOyiAyOf%3Co~ zDP36=9f;HsQ*m*I=wZ4;w|R7Lv6u^k77I&;gwvEvPm45``W2ES^9-XXY6yr^OS;8? z08!miuq7%$5O>P30SrkyQ|vP!7;jfnf+kYsv`!!n2>5}FypY5=SA;PnIYWdep`|fL zj~?~qmF7z_cs{OnGTjK7uI=+&#A_NIGE@5TVnhRBdMQb3G!>-=Ipd?#Dg=bh6fquR ztE6J80yEWq?G_7UMRA{k=PXIhbu-i_g%nO4lAZVmRBEhmiJ{j31(>OVAQ_3H03g{7 zx*1b96DrirrZ+{WO*D$;V~dfKnAv&yr!cC2d9O>|opju_n6;WccGZH7I z86MLW^Yu*;fAp6f)65`!@I@0H3?)8`Ir;c17^R4#R$cJCvDbFt2bXP?!H0mZ?(@wsO zRK>-GLhFNi74am^$%lcnM{;;3j4I8`RS<()K71?6g{ozHvBh)f4EE9UbN+x_X*iXDtG?G)!OANr9Hren37v$H<0ljK*I zOxI5GN&h0zR0d8y6789MN-~koP3cmQ^c&zZ(d;3mS+Rm1HCJ`-1OriGcZWhPAPWQ=&`cnYF36y8zdr+Y6^`>%rT4{@>OyS zs|glmW-#Dlii|?G(I6JUnmuZ!Lb}K_!zH^M2@e4=_Ogv7 z`DX+yy(rP-ur!&OZBKk5FC?Lbrf*!XMGagS3ysT#yz0T1%9H1oHlFOawGokN6HUZD z807D`CI&N&sIl`7g#I}N2`+9^8L%#_k^m9t@HCvy><5F#dqC44e`N$U;D8Nr;`EK^$4L?CIdSvcZ)LTa&;v~O0EadGIwz* zfuJ)azc83BF*qnHGnQ&rmG_xA4+G-q*l98>mKprZX|l&^F4N>%#AwM=mBUi=E=#jk zN65JnnVcD^8# z@;BV6>z-ml7-bBMd4R1{6ZJ}%qE0-PAr8zlk*okuhri+D#2D%%-)k**nY2sW4bsz;o?96-OG)Ia4;dOSZ<-x15Tw5p0T$q?)OeY#SA&i#szV(yECiDUcN^g3~7L=C%O%B&(B# z5V`myfi9c~blgnRN`V+wI8ui%aMI{WjgX3fDN+Wv$AT1tzhLHWh9@e=Oi8LhqCpt$ zMIxtVP`hYU>%|zDDHPX}TT*&vkzGvWOe!m5XtZvpgH)}TJ6Im$3X~Tm0s+l$CcHXAX?HpJVtJc{iw^tn~1D|YzS*>$mu35FS zu_$@oaz(Pm!*gwCeNn7N=@*9mMK!a`Io&>|N`%qK#-~OUuS#e3lSawXgR(8vS7mPu zWU&}54NMwWBnE-Oi0u>eH2s3tQY+)KR4mbhMW=~PqzQeC_R?_OaC904`N=s5J7z}j zELkFwH>#fwtknJZd6_b%n=V;0Nf+}>vt4-wTbJI;G=0B=71JM=7%3K+gogtPDOiGm zxDgk8k2lhgk`=Fr6cH>my~0n(xDXcNYS1NVQY@KNN+n*E=E`c7BxP~S zFUxCWQQ}=#`e5ElcZ2eg^LA2eNLtO!MT(o3J{XZKilwizP$krla|uO4G*O6w)}#ZX zMx`g|G~JLF3Xog*qe_vhxq%nb6*DGp^AlKR4{egM@LeW#w4mcU$Leq1*(V<^(3m4+Tf@q zchc~niHkjQsTd~D8PIfXO}JLS|8tNph`6T}$kcDXyb+nj?*odgKN*6K3KjJdKcwW=I*B1adJOX}`MF zZoj|CIwBO17%ww9HKu|K)ic#-XkZ2f+AOB1da)vVs4-Qy(hDWixLjH;>=dVIb1rQT z!U1EU_&ldp3A}a_i6#!uX{cdv3|#9ccVK&d+vUidTXzh1wMnUrz6dz=!kC)tIi)6xlU@t6 zt6?&tR9*2TP?0vz%4FI?xhujFvw1J%mHol;7v@_SSFTz{28@bjaeR*8O{1~sEGL(| znp8`@uF{3*1!)Q8%lb0GL7L|(T|%KzRA$1}6`L-aXX*JUS~#$T`gjXZr<_&qES5N( z_d=ElXdvV5(4{Jph{mmWJE_LhS$$zZox+r34mDI-EQ;zQ}r;>5DG1XD__ z&f8c(Txd`(lgTRFPT31#G{pJ4Y&I9Knk$vqMk+%Dy`VT@_7()BRkWg%hcgO6J$OMw zEI*1bGOYRXCe_H1f3!KR7nS>|SwWjDRi(I+DjSTnA8oNEAs)lYs!s(*@%sv2-0>0FABWWwebXiy7(#g3sr)Icd$Ww%l_E#_4-aB?N_#Hp2B zB|YIPwdunQu#PEdlbC_GAGM49;OyBI!Zci9?>EC(v+%vLy+sXJYT}I#yL0;OQvH^A zSf=l2NcEnsNS_lO!a37(PL}pOAPbh*YGRoDWrhi`$tpI=F+>sgw6cXs4)M3$RAK3F zROT^*A<1x)%Ag`DAW_0G5P@`!xY}tU%L$qq3+JO~sS@3V&c;=VH&F9h zi3m`viTP|cHiti|IGMHQ+iJ?LQwfm|asVB3W*`A#U}(J1(=z3g37^_9WS}2KsJf^N z-^4hGFT>rHSw9nR90kqdR_)Y_c{i(2x^mWhDR7oj_@h>#LeCoT2+Wro$(c+k4j!YK zghy5+sC=!kq;}G~u%XIDt+ncF%QewT#g$_zIxWK1;?`t#V)5_H8xJQYRXA>SZbOtyHJp 
zjfr~Qe_d}Kl~b_#SskBUrXO4@0cB!l{wJ*cZna)oy5Hp5}h&kUsvf|A@w zz#-AfvfPEDc_-6$i2_`fvxo+J@E)82J#s~5Y_S(fnAFYGF6@Q8_fU{0i@0X@ktC;* zBJHU*N?S`n63bfgi4^UUZc?-urqeNuL={UGWNKN7(p(ajfrg+pny0LS#Rh2tIY|f)%!Ux@4{4V;Vi;@31_f|{%tm-Z#gZXN1KEXV zn>f)Lr3H#a6&AZS`Ji_r?4maAV@ZZu-; zuaDaCEw-OwMU_p4Oph5-@&x4w&=vh?C?kp1RwyvyMlc62pH-ska5RrKaC9jwNdSKm z!c6?CsE`!JmLmv0MoU71@MKOY4mlPQqYGKP+)gP+s!A!l+-gq|;ms)`F!67TSZTW` zzYuw(;q-5RHEBOh#(mK&Qi`(8LLD}gV{Cg54x zwG?cvw0A)zTTJOT(qri?aylhSHRAXFbD=?{3K>a;jf4!1V;O(4t(D}^HQCC>LyMu1 zqXu&=AMCV8Mwj&%HrdI1H^eLKf^66J?f;~4eSC+{HEu3#}Q=kQ=D+>h3wq*bjw7flmg3QjD%m||)D_;h- zjmdXFd3XZSGMt9}fziefn=Xf)r2*Wea-A@*>MKD&2pupnP<#F8zvOuI7QaM=db}{7 z_F#!Yi~GfrHxNt0bOdBe9uu(q%)Lz(dL-Xsdfb|Z1W=U%KQJhgIWE1nrsow!y;P+7 z?v*ZjnMNs(h>^?I1RNuG_7(>t<&m)R3dZ}Gvu7hKDWk>*DT2e z@mi&A2XeCYa-I5`Wv^uOaZ{}8{8f|0i9x-nCP{Og?!iJO`{lR$#Tn_c8h2KJNkehN zHn$?lrF5EmY=SFf?NoMIih3cdqTaHt^7J&}F8{Av=1C|XzwapCV>eKyoA~UIsZ5^c ziH{QMCsmqQX2fKnnEnfM}>G=*w)S=Eb5+okwBn+;XN45-}AbT4TXFYG2tI0a_! z2~_z-QM9;7{DIU%s1#Ju9s#`T$*0H;Yy7{=J6K89aMjp{ouiV{sRSx%X^!QkVsR{3 zxzRJp6f2zeKeQm`$Am-NJh9H6*^3q18dbWq$Cgx;C94$ObS%zIkvPC(9XI_g^^)X` z7zHa_YM|71@$#lDl`?-yBxuy^RAb118LE;>jD~Mz^ME~IET%iI_^-`cxGt1xk17)m zHL5a!w=IjJ@}gV<)l|{4B%zK)l9nYze!klC>4dy8mSu=GXd24_e{EStdCQVCUfLk9 z3Cq&TSe6UtS;n>uiM~$F|F~sAX{$UKA8xx)s1vKiK^ZocZIrOKqr#IKCyDq(*8c!3 zzGVfr5?~Rd9bg3=U~yHRy;G3=Ts)&BOCqufSjh3vDz9YRYN8H>yR3RNxin^dlgY%Y zwXwPxJ{$9&Z_pfAN2UXke&t4WOi%b}K7E~;8)6Q9ciG#QRadvKX+{vliTw4=ORIA; zu8kR{Z1#4O7SkYd%0x<)O&h=^OvajKGW%iT5p@wx>4Jn){EJ3Z6h=Yi@q#d#C&6}g zLl(HzdYVqDv=_IiV`D zL2=Tw!D3&$Jg5YZ+C>%DMQAs<`!&secF4J1wb)WS@no(`6qDNU(CRJ%soK0JDT9PQ zhs3e#PP8{iT`DnD_J&I{r2a%}BbBt4DLR|}47;t;12^iaMfl>Z3T;kP?Ct1fStO@9 zPUq1vwTWgW;|#$ivAv8>Ds!aiDO55I>$Iv^lGDNmG!&C~tO4t(R)&nbjG#pexSi)8 z#4dHo_An#cG%-_ISV*LPj+v6fw>H)`6H8j%j3hw}E{B4Zttqn;6O%kX6uMcr`7)b< z5vPoaV)-)p6;6ZOo&9?WwuJ~0)zx(@{o4D-?MwzDos zH(ERFEh`Rv$T=rw*<>&GeWo_)kaZWC2dUJ2`HSZ zDm2n9xKlNu@Uo4pa`*ePo0Rbi%_U@!XFOUE2PhsZ6r@b@$Os$5oVR}7Zt_AN8uTB>5(ca+8HooF%?v+s4m&f8xw~wZIlS2S#7wF zCryYdqWQo6XBw$t8F-#6_W1+BP?*6^P<_SR9Zr!GrtHJiEJ0XUuIRjo40)~I2GW33 zLC(tDqw{jk?kRhgP4HUh6>`>9=Gs8Tv$7BumYHMcTe)_H-@aCzuT2gxJQqIshMeWG4pUMlxPg@wC zgXBV7lURw&+EmgF;cHv3q+j9>q7usS-7c|O+k_ooB&<8dX2!WeQUo<%+yfT$QYvg$ zB66!@T_7EmCa&6?2m&lGL|u-bp+cDCGFR;ZJ}Oiks(6*!iR;v8Iu}7nmM^Mog(bRk z69&QTaY3u8JAI$hI6A<$)%ty-a$T`6BZKa=Q; zp9_U+p@EXxi8e_9opJJsU*!VH#bjR#Z&OrO#j1dYF{}FMSz&jcfT14pF?p8?wJ)wP z%J*C){@$h#$dbj@bcZQc>;9T(jI`8r7pWdaW!+k`P5Qz@fsBu>c2d}A3N5AEWSG2R zbF`F8`BI@%1s$1ZOjPP(mpj?3Y68vYVHW8{;#NP`v(_d+lw?L-Mafbjt+rDyp7gPF zs~)OODHxZ_<$PhLusJxN5rvhJ{eg@KYbY|XLM+l8)if%S!3o$QwnAOf1M0y$*vfX+ zr(M%g0kiHpt zF_p!8qnMWcRBAf8dm1Y!K3Qs!hctZ3i^<^AI0aGq3J&?}_^?7Y=Fm%gDmE@!=ZRT) zu8(adraPYi>#~z!Su7ws7wgtAN|xb@#4bfKGTkmaQt4`>)B*&|y%@OPWHmvSjngFX z>niZWT05f%tE+~CGio?woMbqdmL`WoXNDS*4kd=A)U?&Z{kQ<>9~`@~GDTqUq^vHF z(dnUj?C-Gdcwb9KcuhX3by%_I5Ys2+_r&9=AdeHr1(FM2U01Iy{*Rxzs76utBuX=@ zv6VRLY%k7ipsdWuL2DUwO+5%`)x*EaJKb6)u2$ZnROg)w|5cA1MEnWQQ*&cD*VLM6 z!5GtTc6I9Lm=&8aBs?pM@laGwV3y<>+~UuZ378&Lp^!34M(PKqaQJVd`4sc;?#8mdh+(h zJO_^gGDuFcP`i#|GgCBGow3UR<{H9d`jVwGg~tSCu}B^~(yjFVtYz`czv+rqjk6hV z6qdM$!ub-8uTCoY-! 
zs4Tmz9nAgdxlxTP(u?riX;w;+m6nHrVCJ&-kL_Xgc%m`RuZOTEr#^@mw#N?+PS2A~ z=4!vOz5lSRLWprxURDQTwx{L`MIc402vffQc!;p{U*hkv{nwnnO8=83VLr4g*?0UH z?54de0gionl3AfR=_Ff`q?0aDw@ZGa1i$OIWEZH55XReG&T@w0S@E;7dsc0Ily&Fr zt~h_ISO=)?t@+taf}5CVApfr9-%Ej4V=wA+p4-bYyx&M`5SVXR5FLJph z-%F24a6ICRx!Nw$2jfJKn9zsVH6XU&l1Jh7Fq0L>V8rC9o)ze|gK?KcWx)o2G%91C^e0%f11GXJsoUNpE9em|3w&UyXf7S#SA=L3 zNe|mvldW?lT9>$9{5NzoX3H}|tRAtlDB)Mt9WJL%X6PPOmS{`2J1Yej#p+T)q$pNT zmc_kh)!g1GGM*@uJsaHNSJ#rph-#vyG|bd#SlRJ#nqcyPAD)NK2@PoC1X4)2crr+B zhR7v0i6<`A2kS4iQGS|$%XNAoD+2P|gM4t?A0U_6RxGrS{aHP!5ib4APo?l{W{J-! zR#n2gg-JcJJ@u1Q<0-AWffXNP&QuNtYEEB*;k#aia z2h~X?Fag|qh#c5SDqMnjR(H&rNxey`!J~sJ$l)-Z+{ey)w9>qXR78}yS#wFYdMpNoe)crco2x&zj6^6i*430P~OCHj!(oLKa>KCnw z_ShJL`e8EgS17lgkz@XUr%vUa<+&FekV+U5 zE>9RyJT%@@zQyN>*QrE>NuQ^N6?~WLrE;ZYkJbAEDV^*Q->lmio*HT3VA~C6lOBkR zlOtf^KAQb1ps+8mR|on8|54bzWElJL)QcJHgC!Wm3IT+$`8|llaV-E4LTd%a4C3 zwt_EYPo5>ei^5C^Aq>)>5Vp-sK?c^Io5=o4rx7xZtd2qRQw?@Am|Y16#)xKcIzwrO z69R98I0Q6g3P;&6&nXJ@tU^&p1`|Tw_94JnyajAxf=L}F{3M`Ac$$l2Nk znW{d@232QWiT!+P4z7ktesl*-z)y&w@e-%r*#s#|?XtIn8zb9ZbAHNAOcrWUp8J=^ z%MW%)1+ru>;SDvWLw@1AnrbyS*ODKH za(*jRahRqoiwL;~32GYHkr<&&7n#h+WKbS|+1MEEz^w4j^?OEY z7R@OMv4*N3S-h*T6j1q-)z@ThAl7kKP9GU2wjT)KtG^5ln@@344d z)nHm~mHf&4MW`+DiRu&YY82(?OE$s$69B{Ou#qj)U9=$Xb#qMofqg^iXQz-zQJHI~ zPYSM4ZVlf^YYeUw|6o{Z0znRE)Q&>j zh-M7+kBWSGsUFQR44LDho?M|4cpQ&MbF9q`Bm<@l|3#5j1vb57L|R3aBR3@{ly3`S z$#nzNZfn@;DD$1rh6eJ(Q#3l;*rWdLQh&Fpznj$Gwd(Im^>>N-J6kRvE`J8fpC0n3 zqx>=DPp14qk^K2v1r%_FC|j0ecH5i&xH&90hru=41J()cv3JSQ9^at708By$4Ty$jZXjBhzhu(nscwcRk45r zviFz*<>}B0{lia|Z?jh6Hdke3`zoFkCCkqTRVdw6v2=HZ(mfRml9u}`+*Kf@*G7F6 zO4lk}hi#Ydvb(ajk}E8a(SP4E*Hs`FN?&R3zjN{8W?Hb`e0;HoQ>eoSZl6-xJ3 z)|raKr&N`VC$3PsuVULP4zs_q(koF_C05n;*pd0$aj#TWm0DHvV{P^rRaL}Jr8Zls zRkg%sDyZrh6jxMLr8ZlMsw%OnF2~iX`W#oQA_=Ids^e)@1CFaz4LYt?HRRv5Djp1} zI5PjBuNr+^t!n(gYgN;at5r=su2vdX#MewgBy;oZLK@Lwspt9Y+H{V zU)$Pvd~NHYf7!OS9$(wqetd0f^S@$SKQOz6UQl~(vW=bpynX%13|K0Fi;(%wyZ^k2 z9b~F2H3+Rj4ZDwnl{JwHnAiU|yoq8W_Lj8`bvGf@nUa=Kd2%LKR!d25UZ!LJRo!k= z)xEE*;n;O2CoW~`PR{en)}0KD|F?C^vn?ope_6=cbtmV>W$I2&`pVXwOexCO{dc4X z|LsrcX&tr*p7wz3nS*%b36e-_D90Rv84Gf7t}N@g)$*Zi(~kxqA`M7H$WHk+Ud6cc zs`iN%2!Cn_+sDpw;i6c|U@;@orp!I;5m5_RJhm&AaqKPfpQy-6sl4zIh13}|0}%df zsSpNNZ9$VYg#)rfEOe-m9!j*|quTGG{T^!%%SNeITUmFL_R<3R;RShGhW|^)kWR_g zQy%G*PeR(s(_p+fzx`-8-`*h4l3FB}eT6AP>%qdazbq!ixU6x|NbZmyS5Eek8d6wE zgepr(*zEx|o{AD{sfekRBvVlOjpF?Zs3k!rm44MxZj!GO=|$;RTNO)IP=EQo zf8iJTOQ7QKTJ@KNi1+Y|{Drs^evv;z7?&KI4Er&ce|JlJ=1-PyUyf z3cX7@7N^BYZ|JnbNZL|q$-xTeq?dR+)+__D%ZHggYzr zqoRsJ6^ag2&~$n3)d4O=XR|(4p=f@w!BI)&DhgFnbR{aP#ERM;JJc2JzM?HxrlQKM zsQIx&T~S394YabMuFQ%${=My2w4%yTRf&o!+BM_R=lEGsm*ZzeY>4{ztZ2aTv!X%A z&x!{An^rXX_*v1!<7Y+V|4l2Je*COx>hZIp+5e^$LE{~_kOl*J++tdZ?B4y?f*R~# zDVORbF#l9kgBm<0gZZbz8q4f|o74SsaSht;xY-hn<8iYk_`rYBmS6*qn=L^F9yeQJ z`^LX$OVD)3&6Z#okDD!RKAyJZ>>E0;H_85yO~U^DL-xcLq(sre`%yN$-#=te%t^&t zC(0MrkMei@qc-If4(s-Jb8*GCNCRN}Y%_E2#W0RlvP}MyEf>uMWQtv(JO5R!JZvnr z!u8n>Cs8Y8Px%7bgBGe-t2`0#Kh+A!Ahp8s*=C=pHB?Ehfr_;%4&>Or{vA~Uzn5gC zV7l0UXp=gMv^$o0 z5jafQA$f>b`L&S)ysG~*H=DP7$pxYLh8}Pp$Ye{XY&JY3J4V@`bwFpgB4l5LpJtGa zq4JaL{8Bo!B3nUOw2So5VsE(oz(1Sq`Cl8QTk`Y^8^ECvWy7LuuSY?OwSp&4vR3RM zgfz6E5!I-r_08fF7xEaUm1h6o3%^rJo&1~*4+pCs&yicyLq=@4WD~1xCMLfysmu04 zyPfi&IG6DwNNUfWv@4fwuR>n3qgl4la;@sP$#X?$MfgnU%Fx!K0e%?9A3DnBLi?8` z{DD9)7z%|sQaDmM(l{c{k?tHB&QZlVGAkTaD;(7-9MvlvHU8$P`8P+czd5r0^HE!~ zt_X&Xx zoIhL(&Umi=t)o7xa@E;yws^4M$I!CguQa|p_nWNiFZiItle2#JE$;r}3G2swS$)-c z@3eXJqF>T3@Bdn}`=)$f@1|iNpZ-koQO#PL8gdyfAHS278oF1PgMG1vi^I*~;n18z zzLB$HEn+DN+@z#vn$9%_;zHbr+~h+72w!XINKGV)oPt=$2xK0z1<`fhBRYlX6rxjz z?nV4a7>OV?5EE&RJIW6LZl68h_s^$QXR=c>LK-!#z+f< 
z-R#;K$l1s!qySlf+=@Jhe1;foEX>C8v+ZM!bIi4mV&{R5vE#;1h%=g#CNqYN9^)nhCL<*snl+2Q%$Y+sii_z1%BStp za9T~1rMc2lX{EGKwN6^58l@Vf+M-&qLLFDA<4Sd0WgjcmakYJ1;~cA;W3_W!>m1i5 zj_VW0n#6HK^0=||xG8z89muaORu5H)q-SV;U%(smgxq0Qijk_PY1YrxL%;RIOB~~B z{tEVZ4h8*GUIqSm5#<6hc$K4kSvqb@$9)*M4ukt#T7qzaOWR7I*G)sY%VO{5l*h15psAa#*?h>1jz7*Zc;fHXuJ zAtxZ&$ce~FNMoc4(iAxvX@)dMPC-sZS|BZvR!D244bm2AhqOmJARUoT$Z5#wNDg8l zoslz;E=X6T8`2%=f%HUrA-$14NMEEM(jOUsoQa%;oQ(`b&Oy#a&O-(v=OcrW3y>km zg~(837&06gfs8~(A)}Eo$XH|?G9HG3U+={G2ZbNQI)+2WycOrKocOx5+dyspP z`;d*u{m28zgUCb3!^k7ZqsU{(E4%v!4kGz1qh-^b% zLS9B*LAE2WB0G@Rke$fu$Q#I;$S&k9_fgnzDB-5_9NdS-yz>42atov56F+mA>=3IXXF>;F!C$%8}d7H1o;E` z6FFLKl;a2jar}n%l4MHeSR?S-BYT%U#$OJjcYvD`7%mA{pN%RrUe!> zsg*T(@Y=IFy#3&bpEX-}_UqcW=8^j^h`;i5yIy?;y|(lFyM8~TZ;Ok&{%RBt=y=l| zXWp7~`IR@c{b|!9??)p?e|+QBaFzYTR&}Yqb^Y8$W0@Zf z4SlfZ)b_!i-UdUSeRIReE3V!<l*wQqmug?%3u z*6aP$vZYf`dw6UQ*UzaVTAla!qAhi9{N(WJ^a<}>w(|PAi@(}&PWH#kZ@whunJ+qz zx2|oPwsGK-)le{cK#leQh|(j7iR2(9NEXr>QOAJwht52Ej5YVUb=7NKGxUV^ujJm{ z;b7Mr>fJr)w+&ahKI?tXZ_Ul0Cl%duMbY*p4f_~B9eL;T{_nhU((@%?w;iMY!cGQZlljcdQQ^vZ{3tUvIB@5h@zozU;qL+_7y>uHa--pH1(Fa7iOtv8%~ za?7>n{CLZ$pDguE`{w6$?e;GS-Sz$O9PdqeL(hy%e(Kky(bGD4KfLB%&uJfad+Lk6 z4_`X)3eSaG^QP?bx{p>JFucLZ!N4pOJIPMK``T?y1iEHZECqiLdPiHNX7e zyKOVN2WQ{>Ra)oSFV$UmSO#RlqLM(5ZyTSwVQPBae))qZ{(dy|k8|5zoL2X(dpn)b zu4(Jh@1+H!w+|Wr`oIgWZ+~*@2KSE(hSR=0@2WSh`R<_`ued&||MQzmZZUU{n9%#8 z!PRPI)DAvz)u*4Iw083udtx2yRL{x?r36y%d2!hfkKge~(`|Rw&Ahng!Snj~rrr1C zhu41h$ME7?Z)&9fvgxP%A8JjB@1&bge6e@48|(bo{DIy3&svnS^Rq1%O|14{ z<7*$EdgpIFR;I^Bf7N~K@*{WDI{m=fcbm*P_xlwsVlVdVxp>R#ZQuLzmY0{e4((d~ z+Pt4`&;DrUZH+E_^!;_u+;jTg8`?KXxpMk|>$eY!kL)&g;ImaO{dwPG@2(j7OS_YQ zy6?I9PnTSH)9kBuPPnt_TT>p)X!u%(Yd36LKc_}y@~AcYW-OJqUH=7RCSU!<+%Jya<$rBc^oxB}zkl!gFQ3@adP=Wdft$X5d-lc??|Wd~DW}vw z{iNnAZ+l_$*jrk)+0?J__Lt`$*|>AXo~rMzZ_#VD``|C{@3&G1yjtAs=PRmT`gY%< zz8k+kV_@IAwtwutX76VYzkXxZl6arT>NVL?eCC*L?yra5aAB+EdO_jh-%hqdzYnf6 z?!mL(_~6PG=SIG&(*5m8%SXI(-OTXKZ7$0^eEuaJt9ldn>$CaK)u|S4`01u)Jv(MwSEn4!)9+~a(Su?Cn!OLbyJE|^qem=!W!BV3PJOFz z$iyC<&wFg=s8KK6HhcErg%@4)-LeRtn4ii-AM_wvh^cbz}K-VHCkbV<&fIlnFb?YHl% zo`3!W-*oSO*-Hlw{2sdJ9&7)uT@Q}C<(6T8+;?Bsuh*=p|IE5|XaAC)zxSHWo3Fp% z(MP8}`^O)j)V}JfPWLuy)F2$Z-AAa%0tIw)mzuTDSp1WaajT+6fueqkp%^puM z?YirlJ?!(P1~M`lv@R*hy8hEoKkoF+H?KB2;e>t@9(bVm)oIhdz9|sM$Xc+V&W%l) z3@WTutL4dAS*@E+p1k{k!GjfDNdis@D zF75X8)8pQ2*Dk+zuU;!Z>eFZSu0ew~?tAUEi+krSU3c~W{P*9#t#-y4 zi=OY>_qumlv>1EN#TTcq?AmqdYrp>bRm3pd-s0jv=M5OJ_CUvu)4SYsQ^P0jxTE)H zXP$Y>kGI}>;h#A-+1GhNw2;-uTMB!BeP1CMy>Ymf3DfEVNZXuYE`RSx^!8x zy?XVN+il%?)p_gJpYy}qxku)A>NKr&Rwyh!`(}kgcfw`w*2MqzI*Y+#~#alGcz;0<3}H@pD}dk zrcXnmYIQ#N;GUvAdp_uJ>Zzm8ZQs7IUoeQxBFLq zf276}PfU4h`t$=!8aC`T<>Zrx+Jc>I%3-fMgK z@HbUfuWtNcdU~wIgb8mv`QCeXPQC0h-z6(owpf4t^-UkCTeo9u@#3mie)ZK$=8heU zd!BR7T}QLC2TcF?-Z zS=RjLuD$lejZK>lz9=oNZu-WJvtAiE@Qx!-J~`>_YSo&wC;Sf}{GUzue~$2f4dMS4 z!v8_S|J{WDs|f$+5dMEA{9i%%Z%Ft*Lim4&@Lxjsznt*jlkh)|@V}by{|VvWApB<# z{%uw{#O$I2NC{j68;wx{vRd$*C+gcO87sI@P88F z-$(d&5&o|u{NG0S|AFxT0O9`$!v7G$e`CV`m4yETg#Vif|F07M-y;0iBm6HV{NF(M zUrYEumGD1}@ZXN`e;483OZY#N@c%2}zZ2pAUc!Gj!vDjBe-Gh*3gQ1K;lBale-+_> zJK_H#!v9l*|0RU~wuJu=2>;y)|6dXQUlJce_+LQy??L$AK=|)R`2U^oe=gy_F5&+K z!vAQ(f0Xe5I^n-P;s1WZe;VQc4Z{D8g#Z48|62(E6A1s+2>(IC|L27NGYJ262>&6% z|2>5N#|i)22>%xo{`(OApCtVMLHNIk@V|-h|2*NpCE>q_@V}ezKc4VEitzt3;Xgw7 z&msJqg#S5&|HFj;bi)56!hdhV|Br-tj{!&m;VwM)+?+_#a02&nNtc z3IEFo{}$mtkMREy;eQR`e+A)x1mS-w;lGgZ-x>I?2mWJ#|2p756!_-?|E|D)5b$>a z|KEWBB;a2J{QCg^cYyyZz`qOdzZLio2L4&VKNa|I2L984{{i5?3i$sD{D%SmHNbx+ z@IN2;F97~g;6DNQZv+090RI`l|1{vA2mJd3|9gP{2H@Ww`1b<-rvU#Jz`q#yUj+Q` z1^#P+e`DZ39{B$O{3inc!@&Pd;BNr`%Ypyhz&{)KKMVY40e?U6Ukv 
zp921u0{;tv|1RKP75M)S{J#SJX8`{jfq!Sf>g|BHct4EU!5{|3On6YxJ3_}>itn*smk!2c}Z{~7SF2K*lf{yl*IXyCsb_@55^ zn*je6z`q~xe;xSW0{lb3e;)Av2>4$H{GS2-Hvs=Df&X^k-wpUz0sfBx|6hRreZap2 z_+JJ5n*#p~;D0Ufp9B0y0skj~|8(GA1Nc7*{I3K4lY##i!2dPizYq9d5BysL{{Zlx z4g4Pf{`GX+rWP#@b3%!-N650;J*a; z*8~1%0{^dpe=Fc$2>dPJUkCWV0sPMe{@sE92;d(E{+YnPBk(@~_`d}FKL`Hz1OJbK ze*y5{0sPwl|5t(kUBG_|@c#h#{|NlI0{@o4KL_|f2mBiX|DM4AYT&N}{|ABpUf{n4 z_%8(hj{yH6!2dkpe+c+nz`r{1KLPmP4g7Ba{=WhL&wzh(;9ms%w*!9z_9!2fyR-v;<+1OI1$zYF+R1OBf8{~5sF2mB`h|M!8v2l%%H z{)e;)9k4E&?O|3lz^8u0%D_zwjBTYXidBFcW;D0^v-wgbB1OJPFe+KZs3iz)D{xRU61^ffRe;M$91o+nl z{s)2oG~jHh|0v+U3i!VO{CffaCxQR3 z!2d+xedqy{~Ex54e;*( z{2KxP^MU_$z`sB6zZ&=-1^%0We^ucBCGejD{BHvO8-f2i;D0*szYX}`0{r^{|M|dw z1@K=F{N2ERKk$DQ_+J71`vU*(f&X2={~F-`I`EGJ|0ckH4DcTc{Fej&#lSxV{KoY0{@P{KLz;T0sO{ z1i+;b04WdvPeB0O4*@U`0$>0HfC~cPP6&WG5CAnH07gOpd;tM)9RxsE2!Il0^lVGfDi=0eh7eZ5CDHb0DKJr@C*dNFAxCNKmc3-0q`sYKy3(sdm#Yg z5CDTA04xXq0|KBw1i%mofI|=fEg=B>5CB&}02Dv~oDBhRIs`x%0^lqNfH4pNLm>dN zApmZM07!!Xco+g800Gb%0^oWGfKCtqjUWIfKmfc70dNxpKo$hRjSv8Z5CA7b05pXF zcmM+6I|zW^Apm}c0O$(=FaiQ#2L!+@2!PWd0P-LJRzU##1OYG}0zk0a1Oi|<1VB9q zfNc-}-$DR94FS*%0^mIefZh-QA3*@@f&kbD0niHqAQuAQBnW`dApojD06Y%?@D2pP zIS>FVApl;30Ej>Ucp(7hK>!?p0O$e%@B{?FXAl5CLIC^;0k9bYU?~K^JrDq!AOLzm z0JtFlK866;0s$}$0$>ybKywIyS`YxK5CD@P0Qx`xWI_P6f&gd+0q_X~z%38}+aUnj zK>(Zw0q_F^z69S+k1i%akfKMR+>OcS#K>&1s05}%{pdSQ4T?l{=AOM08023hq_Cf$$ z2mvq|0^oZHfZHJe?tlPT2LaFq0$>>gz^M=bXFvd)4*~Ef1i*3#fYA^DZ$JR-fdF_2 z0^m0Y03QTELkNKTAOJ3d0Qd?5paulMTMz(CAONO70OUgeG=Km&2m$aS1i+aP0QDgN zW03j*L>2mlWRfDQqW0ReC;1i&x| zfL|d1c0vGL0Rb=w0^kG)fX5*K+Cl(SfdF_A0-yy1z>^RFQy~B@fdE(!0q_U}Knw!l zN(cZG0-z@Zz)=W*=@0-jApkCi05}W*Pyzu^9RlEe2!K-{0A7FqcnkvIIS7D_5C9iJ z0Hi|zyaEAm1Onh~2!Qqw0MLJYE6{&WL;tmd{_6$( z*9ZD<5cJ<`(0@Cj|GtO*y9@g7cj&(}p#SN>UorIG0O-Gt z(0?~U|J?!ocP8}TtprH=zGsh5ide|5bth+YkLW4Ek>s^j{a~zv|F`TcQ8fL;uZ%{_6z&HyZlyLg>Gm z(0>Kcf5V~w=0X4UhyL3Q{TGA&I|=%41@zw%=)Z2zf8Rm>JqG=k3H|pG^xshEzYz4_ z2he|ep#M&V{%a5Y7li)n3H|4V{%ZjJHw60cS?Iqvq5n2O|BZzHy8`;}YUsbc(0?e{Vtm^@IMq6Z&r=^xwJAe}|y|zJ~rA1N}D>`tM%oKR@)} z6VQLtq5m2}|D6o|R~!28cIdx{p#NTg{@Vxr_aXFOA@pB8=)d03e@{XGErb4B3jH?~ z`tLO8zlWj!#zOz~fc|qq|NRX8mkRwi0{X8N^xt{Ve~&}|ErR~r0{vG9`tL^QzfYk5 z4nzN~hW<;3{+j^(_a5}$Wzc^sq5rOj{;Lc9w;1~GE9k!+(0}JZ|7An}eGL7#9QyBO z=)X&#|5Bj;o`L@R0{X8r^xt^sKMVTrTIj!~(0^&re;c9y215Tm3H?_M_>Thqvw{Cb zz<(C-{{i^t0{xX z4){L?{4;_7N5Fq5@DBn14}kw3;D0LcZx8%~z`rN(_X7V0z<&tve-`+^3H&zz|B=A| z3gCY=@ZSskX8`{%f&Vt(-v;{||wGA@HvU{Cflcr-1)5;J+03PX+#` z0sn`A|5)JP1Nggu|Iffb75I+;{;h!jdBFd1;J*m?Zvp;wfd7ra{}bSU82GOS{^`Jf z0`Pwi_+JM6R|5a*fqz}#zZm#`1^jma|8szUHt_!#_%8?kHv|7mfPV_`e+Kw}0sK1y z|M9@z0{+(m|E9n{4ft;a{sV#klfb{)S%LUl&0F8r_L-bpZ=CnzJHOBVtlj+Un;l); z^0G6!Z2jn~R|d{ndFt)Y*6;LPpQ}Tie|xT<|Kyv;X1~|qfuaj;={afH>kab{|2%rp z_|?^RtsAoa!ZD9rIdN)_0e^h^`1?bvb~|tVr8g{G@$TK)@=b3Bf7v_8^Vox3y*r!K zz5M0V(~K?mJ>}A?EX{3yMY`30-0rVe{qWk30}p*saLtCo*q8UTy7<#y+w7}7?c>Sa z57x@7bK>)tOnG!<>RmOS&N_1LX}w=~V}^O%2b=$#{_5aE^*+3##S-6(r`(?rIsb%` z{lkWzS$t>BhkK;V=G#xi_%E%ED f3mS(;PMet9bkdkfQ}Smt_7_YTH@WGU>4pCXO6?<3 literal 0 HcmV?d00001 diff --git a/packages/wasm/lib/utils.c b/packages/wasm/lib/utils.c new file mode 100644 index 00000000..8ba2cd36 --- /dev/null +++ b/packages/wasm/lib/utils.c @@ -0,0 +1,137 @@ +/* + * utils.c - utility functions for libdeflate + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, 
sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "lib_common.h" + +extern unsigned char __heap_base; +static size_t __heap_tail = (size_t) &__heap_base; +static size_t __heap_mark = (size_t) &__heap_base; + +#define memory_size() __builtin_wasm_memory_size(0) + +#define memory_grow(delta) __builtin_wasm_memory_grow(0, delta) + +enum { + _mem_flag_used = 0xbf82583a, + _mem_flag_free = 0xab34d705 +}; + +__attribute__((visibility("default"))) void* __malloc(size_t n) { + n += (8 - (n % 4)) % 4; + // check if size is enough + size_t total = __heap_tail + n + 3 * sizeof(size_t); + size_t size = memory_size() << 16; + if (total > size) { + memory_grow((total >> 16) - (size >> 16) + 1); + } + unsigned int r = __heap_tail; + *((size_t*) r) = n; + r += sizeof(size_t); + *((size_t*) r) =_mem_flag_used; + r += sizeof(size_t); + __heap_tail = r + n; + *((size_t*) __heap_tail) = n; + __heap_tail += sizeof(size_t); + return (void*) r; +} + +__attribute__((visibility("default"))) void __free(void* p) { + size_t n; + // null case + if (!p) return; + size_t r=(size_t)p; + r -= sizeof(size_t); + // already free + if (*((size_t*) r) != _mem_flag_used) { + return; + } + // mark it as free + size_t flag = _mem_flag_free; + *((size_t*) r) = flag; + // calc ptr_tail + r -= sizeof(size_t); + n = *(size_t*) r; // size of current block + size_t ptr_tail = ((size_t) p) + n + sizeof(size_t); + // if not at tail return without moving __heap_tail + if (__heap_tail != ptr_tail) { + return; + } + __heap_tail = r; + while (r > (size_t) &__heap_base) { + r -= sizeof(size_t); + n = *(size_t*) r; // size of previous block + r -= n; + r -= sizeof(size_t); + flag = *((size_t*) r); + if (flag != _mem_flag_free) break; + r -= sizeof(size_t); + n = *(size_t*) r; // size of current block + __heap_tail = r; + } +} + +void * +libdeflate_aligned_malloc(size_t alignment, size_t size) +{ + void *ptr = __malloc(sizeof(void *) + alignment - 1 + size); + + if (ptr) { + void *orig_ptr = ptr; + + ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); + ((void **)ptr)[-1] = orig_ptr; + } + return ptr; +} + +void +libdeflate_aligned_free(void *ptr) +{ + __free((((void **)ptr)[-1])); +} + + +#ifdef LOGGING +char* __debug_log = 0; +char __debug_log_pos = 0; +__attribute__((visibility("default"))) char* __get_debug_log() { + return __debug_log; +} + +void __debug(char* str) { + if (!__debug_log) { + __debug_log = __malloc(1024); + } + + int i = 0; + while (str[i] != '\0') { + __debug_log[__debug_log_pos++] = str[i++]; + } + __debug_log[__debug_log_pos++] = '\n'; + __debug_log[__debug_log_pos] = '\0'; +} +#endif diff --git a/packages/wasm/package.json b/packages/wasm/package.json new file mode 100644 index 00000000..94082a58 --- /dev/null +++ b/packages/wasm/package.json @@ -0,0 +1,29 
@@ +{ + "name": "@mtcute/wasm", + "private": true, + "version": "0.1.0", + "description": "WASM implementation of common algorithms used in Telegram", + "author": "Alina Sireneva ", + "license": "MIT", + "main": "src/index.ts", + "type": "module", + "scripts": { + "test": "mocha \"tests/**/*.spec.ts\"", + "docs": "typedoc", + "build": "pnpm run -w build-package wasm", + "build:wasm": "docker build --output=lib --target=binaries lib" + }, + "browser": { + "./cjs/init.js": "./cjs/init.web.js", + "./esm/init.js": "./esm/init.web.js" + }, + "distOnlyFields": { + "exports": { + ".": { + "import": "./esm/index.js", + "require": "./cjs/index.js" + }, + "./mtcute.wasm": "./mtcute.wasm" + } + } +} diff --git a/packages/wasm/src/index.ts b/packages/wasm/src/index.ts new file mode 100644 index 00000000..21a5080d --- /dev/null +++ b/packages/wasm/src/index.ts @@ -0,0 +1,213 @@ +import { loadWasmBinary } from './init.js' +import { InitInput, MtcuteWasmModule, SyncInitInput } from './types.js' + +export * from './types.js' + +let wasm!: MtcuteWasmModule +let compressor!: number +let decompressor!: number +let cachedUint8Memory: Uint8Array | null = null + +function initCommon() { + compressor = wasm.libdeflate_alloc_compressor(6) + decompressor = wasm.libdeflate_alloc_decompressor() +} + +function getUint8Memory() { + if (cachedUint8Memory === null || cachedUint8Memory.byteLength === 0) { + cachedUint8Memory = new Uint8Array(wasm.memory.buffer) + } + + return cachedUint8Memory +} + +/** + * Init the WASM blob synchronously (e.g. by passing a `WebAssembly.Module` instance) + */ +export function initSync(module: SyncInitInput): void { + if (wasm !== undefined) return + + if (!(module instanceof WebAssembly.Module)) { + module = new WebAssembly.Module(module) + } + + const instance = new WebAssembly.Instance(module) + + wasm = instance.exports as unknown as MtcuteWasmModule + initCommon() +} + +/** + * Init the WASM blob asynchronously (e.g. by passing a URL to the WASM file) + * + * By default, will try to determine the best way to load the WASM file automatically. + */ +export async function initAsync(input?: InitInput): Promise { + if (wasm !== undefined) return + const instance = await loadWasmBinary(input) + + wasm = instance.exports as unknown as MtcuteWasmModule + initCommon() +} + +/** + * Deflate some data with zlib headers and max output size + * + * @returns null if the compressed data is larger than `size`, otherwise the compressed data + */ +export function deflateMaxSize(bytes: Uint8Array, size: number): Uint8Array | null { + const outputPtr = wasm.__malloc(size) + const inputPtr = wasm.__malloc(bytes.length) + getUint8Memory().set(bytes, inputPtr) + + const written = wasm.libdeflate_zlib_compress(compressor, inputPtr, bytes.length, outputPtr, size) + wasm.__free(inputPtr) + + if (written === 0) { + wasm.__free(outputPtr) + + return null + } + + const result = getUint8Memory().slice(outputPtr, outputPtr + written) + wasm.__free(outputPtr) + + return result +} + +/** + * Try to decompress some data with zlib headers + * + * @throws Error if the data is invalid + * @param defaultCapacity default capacity of the output buffer. 
Defaults to `bytes.length * 2` + */ +export function gunzip(bytes: Uint8Array): Uint8Array { + const inputPtr = wasm.__malloc(bytes.length) + getUint8Memory().set(bytes, inputPtr) + + const size = wasm.libdeflate_gzip_get_output_size(inputPtr, bytes.length) + const outputPtr = wasm.__malloc(size) + + const ret = wasm.libdeflate_gzip_decompress(decompressor, inputPtr, bytes.length, outputPtr, size) + + if (ret === -1) throw new Error('gunzip error -- bad data') + if (ret === -2) throw new Error('gunzip error -- short output') + if (ret === -3) throw new Error('gunzip error -- short input') // should never happen + + const result = getUint8Memory().slice(outputPtr, outputPtr + size) + wasm.__free(inputPtr) + wasm.__free(outputPtr) + + return result +} + +/** + * Perform AES-IGE-256 encryption + * + * @param data data to encrypt (must be a multiple of 16 bytes) + * @param key encryption key (32 bytes) + * @param iv initialization vector (32 bytes) + */ +export function ige256Encrypt(data: Uint8Array, key: Uint8Array, iv: Uint8Array): Uint8Array { + const ptr = wasm.__malloc(key.length + iv.length + data.length + data.length) + + const keyPtr = ptr + const ivPtr = ptr + key.length + const inputPtr = ivPtr + iv.length + const outputPtr = inputPtr + data.length + + const mem = getUint8Memory() + mem.set(data, inputPtr) + mem.set(key, keyPtr) + mem.set(iv, ivPtr) + + wasm.ige256_encrypt(inputPtr, data.length, keyPtr, ivPtr, outputPtr) + const result = getUint8Memory().slice(outputPtr, outputPtr + data.length) + + wasm.__free(ptr) + + return result +} + +/** + * Perform AES-IGE-256 decryption + * + * @param data data to decrypt (must be a multiple of 16 bytes) + * @param key encryption key (32 bytes) + * @param iv initialization vector (32 bytes) + */ +export function ige256Decrypt(data: Uint8Array, key: Uint8Array, iv: Uint8Array): Uint8Array { + const ptr = wasm.__malloc(key.length + iv.length + data.length + data.length) + + const keyPtr = ptr + const ivPtr = ptr + key.length + const inputPtr = ivPtr + iv.length + const outputPtr = inputPtr + data.length + + const mem = getUint8Memory() + mem.set(data, inputPtr) + mem.set(key, keyPtr) + mem.set(iv, ivPtr) + + wasm.ige256_decrypt(inputPtr, data.length, keyPtr, ivPtr, outputPtr) + + const result = getUint8Memory().slice(outputPtr, outputPtr + data.length) + wasm.__free(ptr) + + return result +} + +/** + * Create a context for AES-CTR-256 en/decryption + * + * > **Note**: `freeCtr256` must be called on the returned context when it's no longer needed + */ +export function createCtr256(key: Uint8Array, iv: Uint8Array) { + const keyPtr = wasm.__malloc(key.length) + const ivPtr = wasm.__malloc(iv.length) + getUint8Memory().set(key, keyPtr) + getUint8Memory().set(iv, ivPtr) + + const ctx = wasm.ctr256_alloc(keyPtr, ivPtr) + // pointers are "moved" and will be handled by c code + + return ctx +} + +/** + * Release a context for AES-CTR-256 en/decryption + */ +export function freeCtr256(ctx: number) { + wasm.ctr256_free(ctx) +} + +/** + * Perform AES-CTR-256 en/decryption + * + * @param ctx context returned by `createCtr256` + * @param data data to en/decrypt (must be a multiple of 16 bytes) + */ +export function ctr256(ctx: number, data: Uint8Array): Uint8Array { + const { __malloc, __free } = wasm + const inputPtr = __malloc(data.length) + const outputPtr = __malloc(data.length) + + const mem = getUint8Memory() + mem.set(data, inputPtr) + + wasm.ctr256(ctx, inputPtr, data.length, outputPtr) + + const result = mem.slice(outputPtr, outputPtr +
data.length) + __free(outputPtr) + + return result +} + +/** + * Get the WASM module instance. + * + * For debugging and testing purposes only + */ +export function __getWasm(): MtcuteWasmModule { + return wasm +} diff --git a/packages/wasm/src/init.ts b/packages/wasm/src/init.ts new file mode 100644 index 00000000..794bf8f9 --- /dev/null +++ b/packages/wasm/src/init.ts @@ -0,0 +1,24 @@ +/* eslint-disable no-restricted-imports */ +import { readFile } from 'fs/promises' +import { join } from 'path' + +import { InitInput } from './types.js' + +// @only-if-esm +const __dirname = new URL('.', import.meta.url).pathname +// @/only-if-esm + +export async function loadWasmBinary(input?: InitInput): Promise { + if (typeof input === 'undefined') { + input = join(__dirname, '../lib/mtcute.wasm') + } + + if (typeof input !== 'string') { + throw new Error('Invalid input, for Node.js pass path to wasm blob') + } + + const module = new WebAssembly.Module(await readFile(input)) + const instance = new WebAssembly.Instance(module) + + return instance +} diff --git a/packages/wasm/src/init.web.ts b/packages/wasm/src/init.web.ts new file mode 100644 index 00000000..51f09013 --- /dev/null +++ b/packages/wasm/src/init.web.ts @@ -0,0 +1,42 @@ +import { InitInput } from './types.js' + +export async function loadWasmBinary(input?: InitInput): Promise { + if (typeof input === 'undefined') { + input = new URL('../mtcute.wasm', import.meta.url) + } + + if ( + typeof input === 'string' || + (typeof Request === 'function' && input instanceof Request) || + (typeof URL === 'function' && input instanceof URL) + ) { + input = await fetch(input) + } + + if (typeof Response === 'function' && input instanceof Response) { + if (typeof WebAssembly.instantiateStreaming === 'function') { + try { + const { instance } = await WebAssembly.instantiateStreaming(input) + + return instance + } catch (e) { + if (input.headers.get('Content-Type') !== 'application/wasm') { + console.warn( + '`WebAssembly.instantiateStreaming` failed because your server does not serve wasm with `application/wasm` MIME type. Falling back to `WebAssembly.instantiate` which is slower. 
Original error:\n', + e, + ) + } else { + throw e + } + } + } + + const bytes = await input.arrayBuffer() + + const { instance } = await WebAssembly.instantiate(bytes) + + return instance + } + + return await WebAssembly.instantiate(input) +} diff --git a/packages/wasm/src/types.ts b/packages/wasm/src/types.ts new file mode 100644 index 00000000..9828a771 --- /dev/null +++ b/packages/wasm/src/types.ts @@ -0,0 +1,24 @@ +export interface MtcuteWasmModule { + memory: WebAssembly.Memory + __malloc: (size: number) => number + __free: (ptr: number) => void + libdeflate_alloc_decompressor: () => number + libdeflate_alloc_compressor: (level: number) => number + + /** @returns if !=0 - error */ + libdeflate_gzip_decompress: (ctx: number, src: number, srcLen: number, dst: number, dstLen: number) => number + libdeflate_gzip_get_output_size: (src: number, srcLen: number) => number + + libdeflate_zlib_compress: (ctx: number, src: number, srcLen: number, dst: number, dstLen: number) => number + + ige256_encrypt: (data: number, dataLen: number, key: number, iv: number, out: number) => void + + ige256_decrypt: (data: number, dataLen: number, key: number, iv: number, out: number) => void + + ctr256_alloc: (key: number, iv: number) => number + ctr256_free: (ctx: number) => void + ctr256: (ctx: number, data: number, dataLen: number, out: number) => number +} + +export type SyncInitInput = BufferSource | WebAssembly.Module +export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module diff --git a/packages/wasm/tests/allocator.spec.ts b/packages/wasm/tests/allocator.spec.ts new file mode 100644 index 00000000..790113c7 --- /dev/null +++ b/packages/wasm/tests/allocator.spec.ts @@ -0,0 +1,21 @@ +import { expect } from 'chai' + +import { __getWasm, initAsync } from '../src/index.js' + +before(async () => { + await initAsync() +}) + +describe('allocator', () => { + it('should not leak memory', () => { + const wasm = __getWasm() + const memUsage = wasm.memory.buffer.byteLength + + for (let i = 0; i < 1024; i++) { + const ptr = wasm.__malloc(1024) + wasm.__free(ptr) + } + + expect(wasm.memory.buffer.byteLength).to.equal(memUsage) + }) +}) diff --git a/packages/wasm/tests/ctr.spec.ts b/packages/wasm/tests/ctr.spec.ts new file mode 100644 index 00000000..95dd5915 --- /dev/null +++ b/packages/wasm/tests/ctr.spec.ts @@ -0,0 +1,149 @@ +/* eslint-disable no-restricted-globals */ +import { expect } from 'chai' +import { before, describe } from 'mocha' + +import { __getWasm, createCtr256, ctr256, freeCtr256, initAsync } from '../src/index.js' + +before(async () => { + await initAsync() +}) + +describe('aes-ctr', () => { + const key = Buffer.from('603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4', 'hex') + const iv = Buffer.from('F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF', 'hex') + + describe('NIST', () => { + // https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/AES_CTR.pdf + const data = Buffer.from( + `6BC1BEE2 2E409F96 E93D7E11 7393172A + AE2D8A57 1E03AC9C 9EB76FAC 45AF8E51 + 30C81C46 A35CE411 E5FBC119 1A0A52EF + F69F2445 DF4F9B17 AD2B417B E66C3710`.replace(/\s/g, ''), + 'hex', + ) + const dataEnc = Buffer.from( + `601EC313 775789A5 B7A7F504 BBF3D228 + F443E3CA 4D62B59A CA84E990 CACAF5C5 + 2B0930DA A23DE94C E87017BA 2D84988D + DFC9C58D B67AADA6 13C2DD08 457941A6`.replace(/\s/g, ''), + 'hex', + ) + + it('should correctly encrypt', () => { + const ctr = createCtr256(key, iv) + const res = ctr256(ctr, data) + freeCtr256(ctr) + + 
expect(Buffer.from(res).toString('hex')).to.equal(dataEnc.toString('hex')) + }) + + it('should correctly decrypt', () => { + const ctr = createCtr256(key, iv) + const res = ctr256(ctr, dataEnc) + freeCtr256(ctr) + + expect(Buffer.from(res).toString('hex')).to.equal(data.toString('hex')) + }) + }) + + describe('stream', () => { + const data = Buffer.from('6BC1BEE22E409F96E93D7E117393172A', 'hex') + const dataEnc1 = Buffer.from('601ec313775789a5b7a7f504bbf3d228', 'hex') + const dataEnc2 = Buffer.from('31afd77f7d218690bd0ef82dfcf66cbe', 'hex') + const dataEnc3 = Buffer.from('7000927e2f2192cbe4b6a8b2441ddd48', 'hex') + + it('should correctly encrypt', () => { + const ctr = createCtr256(key, iv) + const res1 = ctr256(ctr, data) + const res2 = ctr256(ctr, data) + const res3 = ctr256(ctr, data) + + freeCtr256(ctr) + + expect(Buffer.from(res1).toString('hex')).to.equal(dataEnc1.toString('hex')) + expect(Buffer.from(res2).toString('hex')).to.equal(dataEnc2.toString('hex')) + expect(Buffer.from(res3).toString('hex')).to.equal(dataEnc3.toString('hex')) + }) + + it('should correctly decrypt', () => { + const ctr = createCtr256(key, iv) + const res1 = ctr256(ctr, dataEnc1) + const res2 = ctr256(ctr, dataEnc2) + const res3 = ctr256(ctr, dataEnc3) + + freeCtr256(ctr) + + expect(Buffer.from(res1).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res2).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res3).toString('hex')).to.equal(data.toString('hex')) + }) + }) + + describe('stream (unaligned)', () => { + const data = Buffer.from('6BC1BEE22E40', 'hex') + const dataEnc1 = Buffer.from('601ec3137757', 'hex') + const dataEnc2 = Buffer.from('7df2e078a555', 'hex') + const dataEnc3 = Buffer.from('a3a17be0742e', 'hex') + const dataEnc4 = Buffer.from('025ced833746', 'hex') + const dataEnc5 = Buffer.from('3ff238dea125', 'hex') + const dataEnc6 = Buffer.from('1055a52302dc', 'hex') + + it('should correctly encrypt', () => { + const ctr = createCtr256(key, iv) + const res1 = ctr256(ctr, data) + const res2 = ctr256(ctr, data) + const res3 = ctr256(ctr, data) + const res4 = ctr256(ctr, data) + const res5 = ctr256(ctr, data) + const res6 = ctr256(ctr, data) + + freeCtr256(ctr) + + expect(Buffer.from(res1).toString('hex')).to.equal(dataEnc1.toString('hex')) + expect(Buffer.from(res2).toString('hex')).to.equal(dataEnc2.toString('hex')) + expect(Buffer.from(res3).toString('hex')).to.equal(dataEnc3.toString('hex')) + expect(Buffer.from(res4).toString('hex')).to.equal(dataEnc4.toString('hex')) + expect(Buffer.from(res5).toString('hex')).to.equal(dataEnc5.toString('hex')) + expect(Buffer.from(res6).toString('hex')).to.equal(dataEnc6.toString('hex')) + }) + + it('should correctly decrypt', () => { + const ctr = createCtr256(key, iv) + const res1 = ctr256(ctr, dataEnc1) + const res2 = ctr256(ctr, dataEnc2) + const res3 = ctr256(ctr, dataEnc3) + const res4 = ctr256(ctr, dataEnc4) + const res5 = ctr256(ctr, dataEnc5) + const res6 = ctr256(ctr, dataEnc6) + + freeCtr256(ctr) + + expect(Buffer.from(res1).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res2).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res3).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res4).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res5).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res6).toString('hex')).to.equal(data.toString('hex')) + }) + }) + + it('should not leak memory', () => { + const data = 
Buffer.from('6BC1BEE22E409F96E93D7E117393172A', 'hex') + const mem = __getWasm().memory.buffer + const memSize = mem.byteLength + + for (let i = 0; i < 100; i++) { + const ctrEnc = createCtr256(key, iv) + const ctrDec = createCtr256(key, iv) + + for (let i = 0; i < 100; i++) { + ctr256(ctrDec, ctr256(ctrEnc, data)) + } + + freeCtr256(ctrEnc) + freeCtr256(ctrDec) + } + + expect(mem.byteLength).to.equal(memSize) + }) +}) diff --git a/packages/wasm/tests/gunzip.spec.ts b/packages/wasm/tests/gunzip.spec.ts new file mode 100644 index 00000000..8111b829 --- /dev/null +++ b/packages/wasm/tests/gunzip.spec.ts @@ -0,0 +1,46 @@ +/* eslint-disable no-restricted-globals */ +import { expect } from 'chai' +import { before, describe } from 'mocha' +import { gzipSync } from 'zlib' + +import { __getWasm, gunzip, initAsync } from '../src/index.js' + +before(async () => { + await initAsync() +}) + +describe('gunzip', () => { + it('should correctly read zlib headers', () => { + const wasm = __getWasm() + const data = gzipSync(Buffer.from('hello world')) + + const inputPtr = wasm.__malloc(data.length) + new Uint8Array(wasm.memory.buffer).set(data, inputPtr) + + expect(wasm.libdeflate_gzip_get_output_size(inputPtr, data.length)).to.equal(11) + }) + + it('should correctly inflate', () => { + const data = Array.from({ length: 1000 }, () => 'a').join('') + const res = gzipSync(Buffer.from(data)) + + expect(res).not.to.be.null + expect(res.length).to.be.lessThan(100) + expect(gunzip(res)).to.deep.equal(new Uint8Array(Buffer.from(data))) + }) + + it('should not leak memory', () => { + const memSize = __getWasm().memory.buffer.byteLength + + for (let i = 0; i < 100; i++) { + const data = Array.from({ length: 1000 }, () => 'a').join('') + const deflated = gzipSync(Buffer.from(data)) + + const res = gunzip(deflated) + + expect(Buffer.from(res).toString()).to.equal(data) + } + + expect(__getWasm().memory.buffer.byteLength).to.equal(memSize) + }) +}) diff --git a/packages/wasm/tests/ige.spec.ts b/packages/wasm/tests/ige.spec.ts new file mode 100644 index 00000000..6bfcd2bd --- /dev/null +++ b/packages/wasm/tests/ige.spec.ts @@ -0,0 +1,40 @@ +/* eslint-disable no-restricted-globals */ +import { expect } from 'chai' +import { before, describe } from 'mocha' + +import { __getWasm, ige256Decrypt, ige256Encrypt, initAsync } from '../src/index.js' + +before(async () => { + await initAsync() +}) + +describe('aes-ige', () => { + const key = Buffer.from('5468697320697320616E20696D706C655468697320697320616E20696D706C65', 'hex') + const iv = Buffer.from('6D656E746174696F6E206F6620494745206D6F646520666F72204F70656E5353', 'hex') + + const data = Buffer.from('99706487a1cde613bc6de0b6f24b1c7aa448c8b9c3403e3467a8cad89340f53b', 'hex') + const dataEnc = Buffer.from('792ea8ae577b1a66cb3bd92679b8030ca54ee631976bd3a04547fdcb4639fa69', 'hex') + + it('should correctly encrypt', () => { + const aes = ige256Encrypt(data, key, iv) + + expect(Buffer.from(aes).toString('hex')).to.equal(dataEnc.toString('hex')) + }) + + it('should correctly decrypt', () => { + const aes = ige256Decrypt(dataEnc, key, iv) + + expect(Buffer.from(aes).toString('hex')).to.equal(data.toString('hex')) + }) + + it('should not leak memory', () => { + const mem = __getWasm().memory.buffer + const memSize = mem.byteLength + + for (let i = 0; i < 10000; i++) { + ige256Decrypt(ige256Encrypt(data, key, iv), key, iv) + } + + expect(mem.byteLength).to.equal(memSize) + }) +}) diff --git a/packages/wasm/tests/tsconfig.json b/packages/wasm/tests/tsconfig.json new file mode 100644 
index 00000000..23b6b033 --- /dev/null +++ b/packages/wasm/tests/tsconfig.json @@ -0,0 +1,9 @@ +{ + "extends": "../../../tsconfig.json", + "include": [ + "." + ], + "references": [ + { "path": "../" } + ] +} diff --git a/packages/wasm/tests/zlib.spec.ts b/packages/wasm/tests/zlib.spec.ts new file mode 100644 index 00000000..7a35dc90 --- /dev/null +++ b/packages/wasm/tests/zlib.spec.ts @@ -0,0 +1,49 @@ +/* eslint-disable no-restricted-globals */ +import { expect } from 'chai' +import { before, describe } from 'mocha' +import { inflateSync } from 'zlib' + +import { __getWasm, deflateMaxSize, initAsync } from '../src/index.js' + +before(async () => { + await initAsync() +}) + +describe('zlib deflate', () => { + it('should add zlib headers', () => { + const res = deflateMaxSize(Buffer.from('hello world'), 100) + + expect(res).not.to.be.null + expect(res!.slice(0, 2)).to.deep.equal(Buffer.from([0x78, 0x9c])) + }) + + it('should return null if compressed data is larger than size', () => { + const res = deflateMaxSize(Buffer.from('hello world'), 1) + + expect(res).to.be.null + }) + + it('should correctly deflate', () => { + const data = Array.from({ length: 1000 }, () => 'a').join('') + const res = deflateMaxSize(Buffer.from(data), 100) + + expect(res).not.to.be.null + expect(res!.length).to.be.lessThan(100) + expect(inflateSync(res!)).to.deep.equal(Buffer.from(data)) + }) + + it('should not leak memory', () => { + const memSize = __getWasm().memory.buffer.byteLength + + for (let i = 0; i < 100; i++) { + const data = Array.from({ length: 1000 }, () => 'a').join('') + const deflated = deflateMaxSize(Buffer.from(data), 100) + + const res = inflateSync(deflated!) + + expect(Buffer.from(res).toString()).to.equal(data) + } + + expect(__getWasm().memory.buffer.byteLength).to.equal(memSize) + }) +}) diff --git a/packages/wasm/tsconfig.json b/packages/wasm/tsconfig.json new file mode 100644 index 00000000..6a3f7a53 --- /dev/null +++ b/packages/wasm/tsconfig.json @@ -0,0 +1,10 @@ +{ + "extends": "../../tsconfig.json", + "compilerOptions": { + "outDir": "./dist/esm", + "rootDir": "./src" + }, + "include": [ + "./src", + ] +} diff --git a/packages/wasm/typedoc.cjs b/packages/wasm/typedoc.cjs new file mode 100644 index 00000000..84e0d8c2 --- /dev/null +++ b/packages/wasm/typedoc.cjs @@ -0,0 +1,4 @@ +module.exports = { + extends: ['../../typedoc.base.cjs'], + entryPoints: ['./src/index.ts'], +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 56488ef2..2bde8c29 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -126,6 +126,9 @@ importers: '@mtcute/tl-runtime': specifier: workspace:^ version: link:../tl-runtime + '@mtcute/wasm': + specifier: workspace:^ + version: link:../wasm '@types/events': specifier: 3.0.0 version: 3.0.0 @@ -139,9 +142,6 @@ importers: specifier: 5.2.3 version: 5.2.3 devDependencies: - '@cryptography/aes': - specifier: ^0.1.1 - version: 0.1.1 '@types/ws': specifier: 8.5.4 version: 8.5.4 @@ -312,13 +312,6 @@ importers: long: specifier: 5.2.3 version: 5.2.3 - pako: - specifier: 2.1.0 - version: 2.1.0 - devDependencies: - '@types/pako': - specifier: 2.0.0 - version: 2.0.0 packages/tl-utils: dependencies: @@ -330,6 +323,8 @@ importers: specifier: workspace:^ version: link:../tl-runtime + packages/wasm: {} + packages: /@aashutoshrathi/word-wrap@1.2.6: @@ -702,10 +697,6 @@ packages: chalk: 4.1.2 dev: true - /@cryptography/aes@0.1.1: - resolution: {integrity: sha512-PcYz4FDGblO6tM2kSC+VzhhK62vml6k6/YAkiWtyPvrgJVfnDRoHGDtKn5UiaRRUrvUTTocBpvc2rRgTCqxjsg==} - dev: true - 
/@cspotcode/source-map-support@0.8.1: resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==} engines: {node: '>=12'} @@ -968,10 +959,6 @@ packages: resolution: {integrity: sha512-Gj7cI7z+98M282Tqmp2K5EIsoouUEzbBJhQQzDE3jSIRk6r9gsz0oUokqIUR4u1R3dMHo0pDHM7sNOHyhulypw==} dev: true - /@types/pako@2.0.0: - resolution: {integrity: sha512-10+iaz93qR5WYxTo+PMifD5TSxiOtdRaxBf7INGGXMQgTCu8Z/7GYWYFUOS3q/G0nE5boj1r4FEB+WSy7s5gbA==} - dev: true - /@types/semver@7.5.0: resolution: {integrity: sha512-G8hZ6XJiHnuhQKR7ZmysCeJWE08o8T0AXtk5darsCaTVsYZhhgUrq53jizaR2FvsoeCwJhlmwTjkXBY5Pn/ZHw==} dev: true @@ -4311,10 +4298,6 @@ packages: release-zalgo: 1.0.0 dev: true - /pako@2.1.0: - resolution: {integrity: sha512-w+eufiZ1WuJYgPXbV/PO3NCMEc3xqylkKHzp8bxp1uW4qaSNQUkwmLLEc3kKsfz8lpV1F8Ht3U1Cm+9Srog2ug==} - dev: false - /parent-module@1.0.1: resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==} engines: {node: '>=6'} diff --git a/scripts/build-package.js b/scripts/build-package.js index 4aa645a2..2512d761 100644 --- a/scripts/build-package.js +++ b/scripts/build-package.js @@ -50,14 +50,10 @@ const buildConfig = { }) } - console.log(config) - return config })(), } -console.log(buildConfig) - function buildPackageJson() { const pkgJson = JSON.parse(fs.readFileSync(path.join(packageDir, 'package.json'), 'utf-8'))
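The entry points added in packages/wasm/src/index.ts are lazy: nothing works until initSync or initAsync has completed. Below is a minimal sketch of the three supported ways to feed the binary to the module; the package specifier `@mtcute/wasm` comes from the new package.json, while the browser URL and the `wasmBytes` argument are placeholders, not part of the patch.

```ts
import { __getWasm, initAsync, initSync } from '@mtcute/wasm'

// Node: with no argument, init.ts reads ../lib/mtcute.wasm relative to the module
async function bootNode(): Promise<void> {
    await initAsync()
}

// Browser: init.web.ts accepts a string/URL/Request/Response and falls back from
// instantiateStreaming to instantiate when the MIME type is not application/wasm
async function bootBrowser(): Promise<void> {
    await initAsync(new URL('/assets/mtcute.wasm', location.origin)) // placeholder URL
}

// Synchronous path: raw bytes or an already-compiled WebAssembly.Module
function bootSync(wasmBytes: BufferSource): void {
    initSync(wasmBytes)
    console.log('exports ready:', typeof __getWasm().__malloc === 'function')
}
```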
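deflateMaxSize and gunzip wrap libdeflate_zlib_compress and libdeflate_gzip_decompress respectively, so the former emits zlib-framed output while the latter expects gzip-framed input. A usage sketch under those assumptions (the payload, the 512-byte cap, and the use of node:zlib to fabricate gzip input, as the specs above do, are illustrative):

```ts
import { gzipSync } from 'node:zlib'

import { deflateMaxSize, gunzip, initAsync } from '@mtcute/wasm'

async function main(): Promise<void> {
    await initAsync()

    // zlib-compress only if the result fits into 512 bytes; null means "send uncompressed"
    const payload = new TextEncoder().encode('a'.repeat(4096))
    const packed = deflateMaxSize(payload, 512)
    console.log(packed === null ? 'not worth compressing' : `packed ${payload.length} -> ${packed.length} bytes`)

    // gunzip expects gzip-framed data; fabricate some with node:zlib for the demo
    const gzipped = new Uint8Array(gzipSync('hello world'))
    console.log(new TextDecoder().decode(gunzip(gzipped))) // "hello world"
}

main().catch(console.error)
```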
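The crypto exports mirror the C code under lib/crypto: the IGE calls are one-shot, while CTR keeps keystream state in a wasm-side context that must be released with freeCtr256. A hedged sketch of the intended call pattern (keys, IVs, and the 64-byte buffer are random placeholders; the sizes follow the doc comments and specs above: 32-byte key and IV for IGE, 32-byte key and 16-byte IV for CTR):

```ts
import { randomBytes } from 'node:crypto'

import { createCtr256, ctr256, freeCtr256, ige256Decrypt, ige256Encrypt, initAsync } from '@mtcute/wasm'

async function main(): Promise<void> {
    await initAsync()

    // AES-256-IGE: 32-byte key, 32-byte IV, data length a multiple of 16
    const igeKey = new Uint8Array(randomBytes(32))
    const igeIv = new Uint8Array(randomBytes(32))
    const plaintext = new Uint8Array(randomBytes(64))

    const encrypted = ige256Encrypt(plaintext, igeKey, igeIv)
    const decrypted = ige256Decrypt(encrypted, igeKey, igeIv)
    console.log('ige roundtrip ok:', Buffer.compare(Buffer.from(decrypted), Buffer.from(plaintext)) === 0)

    // AES-256-CTR: the context carries keystream position across calls and lives on the wasm heap
    const ctr = createCtr256(new Uint8Array(randomBytes(32)), new Uint8Array(randomBytes(16)))
    try {
        const part1 = ctr256(ctr, plaintext.subarray(0, 32))
        const part2 = ctr256(ctr, plaintext.subarray(32))
        console.log('ctr produced', part1.length + part2.length, 'bytes')
    } finally {
        freeCtr256(ctr)
    }
}

main().catch(console.error)
```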