From eec142f0e50b7c74bc9105237c87e3786ef54e10 Mon Sep 17 00:00:00 2001 From: Alina Sireneva Date: Sat, 4 Nov 2023 06:44:18 +0300 Subject: [PATCH] =?UTF-8?q?feat:=20wasm!=20=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/core/package.json | 2 +- packages/core/src/base-client.ts | 1 + packages/core/src/network/auth-key.ts | 13 +- packages/core/src/network/authorization.ts | 10 +- .../core/src/network/session-connection.ts | 78 +- .../core/src/network/transports/obfuscated.ts | 28 +- packages/core/src/utils/crypto/abstract.ts | 31 +- packages/core/src/utils/crypto/common.ts | 67 - packages/core/src/utils/crypto/node-crypto.ts | 64 - packages/core/src/utils/crypto/node.ts | 93 + .../src/utils/crypto/{subtle.ts => web.ts} | 71 +- packages/core/src/utils/platform/crypto.ts | 2 +- .../core/src/utils/platform/crypto.web.ts | 4 +- packages/core/tests/auth-key.spec.ts | 2 +- packages/core/tests/crypto-providers.spec.ts | 86 +- packages/core/tests/keys.spec.ts | 2 +- packages/core/tests/mtproto-crypto.spec.ts | 2 +- packages/crypto-node/.gitignore | 2 +- packages/crypto-node/src/index.ts | 4 +- packages/crypto/README.md | 5 - packages/crypto/cbc256.c | 36 - packages/crypto/cbc256.h | 17 - packages/crypto/ctr256.c | 32 - packages/crypto/ctr256.h | 8 - packages/tl-runtime/package.json | 10 +- packages/tl-runtime/src/encodings/gzip.ts | 39 - packages/tl-runtime/src/encodings/gzip.web.ts | 40 - packages/tl-runtime/src/encodings/index.ts | 1 - packages/tl-runtime/src/index.ts | 1 - packages/tl-runtime/src/reader.ts | 10 +- packages/tl/scripts/gen-rsa-keys.ts | 2 +- packages/wasm/.gitignore | 1 + packages/wasm/README.md | 19 + packages/wasm/build.config.cjs | 20 + packages/wasm/lib/Dockerfile | 14 + packages/wasm/lib/Makefile | 75 + packages/wasm/lib/common_defs.h | 686 +++ packages/wasm/lib/crypto/COPYING.lesser | 165 + packages/{ => wasm/lib}/crypto/aes256.c | 0 packages/{ => wasm/lib}/crypto/aes256.h | 5 +- packages/wasm/lib/crypto/ctr256.c | 55 + packages/wasm/lib/crypto/ctr256.h | 6 + packages/{ => wasm/lib}/crypto/ige256.c | 4 +- packages/{ => wasm/lib}/crypto/ige256.h | 0 packages/wasm/lib/lib_common.h | 62 + packages/wasm/lib/libdeflate.h | 245 + packages/wasm/lib/libdeflate/COPYING | 21 + packages/wasm/lib/libdeflate/adler32.c | 123 + packages/wasm/lib/libdeflate/adler32.h | 8 + packages/wasm/lib/libdeflate/bt_matchfinder.h | 342 ++ .../wasm/lib/libdeflate/decompress_template.h | 777 ++++ .../wasm/lib/libdeflate/deflate_compress.c | 4119 +++++++++++++++++ .../wasm/lib/libdeflate/deflate_compress.h | 20 + .../wasm/lib/libdeflate/deflate_constants.h | 56 + .../wasm/lib/libdeflate/deflate_decompress.c | 1200 +++++ .../wasm/lib/libdeflate/deflate_decompress.h | 14 + packages/wasm/lib/libdeflate/gzip_constants.h | 45 + .../wasm/lib/libdeflate/gzip_decompress.c | 160 + packages/wasm/lib/libdeflate/hc_matchfinder.h | 401 ++ packages/wasm/lib/libdeflate/ht_matchfinder.h | 234 + .../wasm/lib/libdeflate/matchfinder_common.h | 194 + packages/wasm/lib/libdeflate/zlib_compress.c | 83 + packages/wasm/lib/libdeflate/zlib_constants.h | 21 + packages/wasm/lib/mtcute.wasm | Bin 0 -> 45120 bytes packages/wasm/lib/utils.c | 137 + packages/wasm/package.json | 29 + packages/wasm/src/index.ts | 213 + packages/wasm/src/init.ts | 24 + packages/wasm/src/init.web.ts | 42 + packages/wasm/src/types.ts | 24 + packages/wasm/tests/allocator.spec.ts | 21 + packages/wasm/tests/ctr.spec.ts | 149 + packages/wasm/tests/gunzip.spec.ts | 46 + 
packages/wasm/tests/ige.spec.ts | 40 + packages/wasm/tests/tsconfig.json | 9 + packages/wasm/tests/zlib.spec.ts | 49 + packages/wasm/tsconfig.json | 10 + packages/wasm/typedoc.cjs | 4 + pnpm-lock.yaml | 27 +- scripts/build-package.js | 4 - 80 files changed, 10231 insertions(+), 535 deletions(-) delete mode 100644 packages/core/src/utils/crypto/common.ts delete mode 100644 packages/core/src/utils/crypto/node-crypto.ts create mode 100644 packages/core/src/utils/crypto/node.ts rename packages/core/src/utils/crypto/{subtle.ts => web.ts} (54%) delete mode 100644 packages/crypto/README.md delete mode 100644 packages/crypto/cbc256.c delete mode 100644 packages/crypto/cbc256.h delete mode 100644 packages/crypto/ctr256.c delete mode 100644 packages/crypto/ctr256.h delete mode 100644 packages/tl-runtime/src/encodings/gzip.ts delete mode 100644 packages/tl-runtime/src/encodings/gzip.web.ts create mode 100644 packages/wasm/.gitignore create mode 100644 packages/wasm/README.md create mode 100644 packages/wasm/build.config.cjs create mode 100644 packages/wasm/lib/Dockerfile create mode 100644 packages/wasm/lib/Makefile create mode 100644 packages/wasm/lib/common_defs.h create mode 100644 packages/wasm/lib/crypto/COPYING.lesser rename packages/{ => wasm/lib}/crypto/aes256.c (100%) rename packages/{ => wasm/lib}/crypto/aes256.h (84%) create mode 100644 packages/wasm/lib/crypto/ctr256.c create mode 100644 packages/wasm/lib/crypto/ctr256.h rename packages/{ => wasm/lib}/crypto/ige256.c (84%) rename packages/{ => wasm/lib}/crypto/ige256.h (100%) create mode 100644 packages/wasm/lib/lib_common.h create mode 100644 packages/wasm/lib/libdeflate.h create mode 100644 packages/wasm/lib/libdeflate/COPYING create mode 100644 packages/wasm/lib/libdeflate/adler32.c create mode 100644 packages/wasm/lib/libdeflate/adler32.h create mode 100644 packages/wasm/lib/libdeflate/bt_matchfinder.h create mode 100644 packages/wasm/lib/libdeflate/decompress_template.h create mode 100644 packages/wasm/lib/libdeflate/deflate_compress.c create mode 100644 packages/wasm/lib/libdeflate/deflate_compress.h create mode 100644 packages/wasm/lib/libdeflate/deflate_constants.h create mode 100644 packages/wasm/lib/libdeflate/deflate_decompress.c create mode 100644 packages/wasm/lib/libdeflate/deflate_decompress.h create mode 100644 packages/wasm/lib/libdeflate/gzip_constants.h create mode 100644 packages/wasm/lib/libdeflate/gzip_decompress.c create mode 100644 packages/wasm/lib/libdeflate/hc_matchfinder.h create mode 100644 packages/wasm/lib/libdeflate/ht_matchfinder.h create mode 100644 packages/wasm/lib/libdeflate/matchfinder_common.h create mode 100644 packages/wasm/lib/libdeflate/zlib_compress.c create mode 100644 packages/wasm/lib/libdeflate/zlib_constants.h create mode 100755 packages/wasm/lib/mtcute.wasm create mode 100644 packages/wasm/lib/utils.c create mode 100644 packages/wasm/package.json create mode 100644 packages/wasm/src/index.ts create mode 100644 packages/wasm/src/init.ts create mode 100644 packages/wasm/src/init.web.ts create mode 100644 packages/wasm/src/types.ts create mode 100644 packages/wasm/tests/allocator.spec.ts create mode 100644 packages/wasm/tests/ctr.spec.ts create mode 100644 packages/wasm/tests/gunzip.spec.ts create mode 100644 packages/wasm/tests/ige.spec.ts create mode 100644 packages/wasm/tests/tsconfig.json create mode 100644 packages/wasm/tests/zlib.spec.ts create mode 100644 packages/wasm/tsconfig.json create mode 100644 packages/wasm/typedoc.cjs diff --git a/packages/core/package.json 
b/packages/core/package.json index e11ecdf9..078cde3c 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -46,13 +46,13 @@ "dependencies": { "@mtcute/tl": "workspace:^", "@mtcute/tl-runtime": "workspace:^", + "@mtcute/wasm": "workspace:^", "@types/events": "3.0.0", "big-integer": "1.6.51", "events": "3.2.0", "long": "5.2.3" }, "devDependencies": { - "@cryptography/aes": "^0.1.1", "@types/ws": "8.5.4", "node-forge": "1.3.1", "ws": "8.13.0" diff --git a/packages/core/src/base-client.ts b/packages/core/src/base-client.ts index 16da46d3..93b54246 100644 --- a/packages/core/src/base-client.ts +++ b/packages/core/src/base-client.ts @@ -361,6 +361,7 @@ export class BaseTelegramClient extends EventEmitter { const promise = (this._connected = createControllablePromise()) + await this.crypto.initialize?.() await this._loadStorage() const primaryDc = await this.storage.getDefaultDcs() if (primaryDc !== null) this._defaultDcs = primaryDc diff --git a/packages/core/src/network/auth-key.ts b/packages/core/src/network/auth-key.ts index deb12791..af6ec415 100644 --- a/packages/core/src/network/auth-key.ts +++ b/packages/core/src/network/auth-key.ts @@ -5,7 +5,14 @@ import { TlBinaryReader, TlReaderMap } from '@mtcute/tl-runtime' import { MtcuteError } from '../types/errors.js' import { createAesIgeForMessage } from '../utils/crypto/mtproto.js' -import { buffersEqual, concatBuffers, dataViewFromBuffer, ICryptoProvider, Logger, randomBytes } from '../utils/index.js' +import { + buffersEqual, + concatBuffers, + dataViewFromBuffer, + ICryptoProvider, + Logger, + randomBytes, +} from '../utils/index.js' export class AuthKey { ready = false @@ -55,7 +62,7 @@ export class AuthKey { const messageKey = (await this._crypto.sha256(concatBuffers([this.clientSalt, buf]))).subarray(8, 24) const ige = await createAesIgeForMessage(this._crypto, this.key, messageKey, true) - const encryptedData = await ige.encrypt(buf) + const encryptedData = ige.encrypt(buf) return concatBuffers([this.id, messageKey, encryptedData]) } @@ -78,7 +85,7 @@ export class AuthKey { } const ige = await createAesIgeForMessage(this._crypto, this.key, messageKey, false) - const innerData = await ige.decrypt(encryptedData) + const innerData = ige.decrypt(encryptedData) const msgKeySource = await this._crypto.sha256(concatBuffers([this.serverSalt, innerData])) const expectedMessageKey = msgKeySource.subarray(8, 24) diff --git a/packages/core/src/network/authorization.ts b/packages/core/src/network/authorization.ts index 7b184a5e..82f7a39b 100644 --- a/packages/core/src/network/authorization.ts +++ b/packages/core/src/network/authorization.ts @@ -141,8 +141,8 @@ async function rsaPad(data: Uint8Array, crypto: ICryptoProvider, key: TlPublicKe // we only need to reverse the data dataWithHash.subarray(0, 192).reverse() - const aes = await crypto.createAesIge(aesKey, aesIv) - const encrypted = await aes.encrypt(dataWithHash) + const aes = crypto.createAesIge(aesKey, aesIv) + const encrypted = aes.encrypt(dataWithHash) const encryptedHash = await crypto.sha256(encrypted) xorBufferInPlace(aesKey, encryptedHash) @@ -300,9 +300,9 @@ export async function doAuthorization( // Step 3: complete DH exchange const [key, iv] = await generateKeyAndIvFromNonce(crypto, resPq.serverNonce, newNonce) - const ige = await crypto.createAesIge(key, iv) + const ige = crypto.createAesIge(key, iv) - const plainTextAnswer = await ige.decrypt(serverDhParams.encryptedAnswer) + const plainTextAnswer = ige.decrypt(serverDhParams.encryptedAnswer) const 
innerDataHash = plainTextAnswer.subarray(0, 20) const serverDhInnerReader = new TlBinaryReader(readerMap, plainTextAnswer, 20) const serverDhInner = serverDhInnerReader.object() as mtp.TlObject @@ -379,7 +379,7 @@ export async function doAuthorization( log.debug('sending client DH (timeOffset = %d)', timeOffset) - const clientDhEncrypted = await ige.encrypt(clientDhInnerWriter.uint8View) + const clientDhEncrypted = ige.encrypt(clientDhInnerWriter.uint8View) await sendPlainMessage({ _: 'mt_set_client_DH_params', nonce, diff --git a/packages/core/src/network/session-connection.ts b/packages/core/src/network/session-connection.ts index e3e4afee..db8a4e36 100644 --- a/packages/core/src/network/session-connection.ts +++ b/packages/core/src/network/session-connection.ts @@ -3,15 +3,7 @@ import Long from 'long' import { mtp, tl } from '@mtcute/tl' -import { - gzipDeflate, - gzipInflate, - TlBinaryReader, - TlBinaryWriter, - TlReaderMap, - TlSerializationCounter, - TlWriterMap, -} from '@mtcute/tl-runtime' +import { TlBinaryReader, TlBinaryWriter, TlReaderMap, TlSerializationCounter, TlWriterMap } from '@mtcute/tl-runtime' import { MtArgumentError, MtcuteError, MtTimeoutError } from '../types/index.js' import { createAesIgeForMessageOld } from '../utils/crypto/mtproto.js' @@ -20,6 +12,7 @@ import { ControllablePromise, createControllablePromise, EarlyTimer, + ICryptoProvider, longFromBuffer, randomBytes, randomLong, @@ -51,6 +44,12 @@ export interface SessionConnectionParams extends PersistentConnectionParams { // destroy_auth_key#d1435160 = DestroyAuthKeyRes; // const DESTROY_AUTH_KEY = Buffer.from('605134d1', 'hex') +// gzip_packed#3072cfa1 packed_data:string = Object; +const GZIP_PACKED_ID = 0x3072cfa1 +// msg_container#73f1f8dc messages:vector<%Message> = MessageContainer; +const MSG_CONTAINER_ID = 0x73f1f8dc +// rpc_result#f35c6d01 req_msg_id:long result:Object = RpcResult; +const RPC_RESULT_ID = 0xf35c6d01 function makeNiceStack(error: tl.RpcError, stack: string, method?: string) { error.stack = `RpcError (${error.code} ${error.text}): ${error.message}\n at ${method}\n${stack @@ -80,6 +79,7 @@ export class SessionConnection extends PersistentConnection { private _readerMap: TlReaderMap private _writerMap: TlWriterMap + private _crypto: ICryptoProvider constructor( params: SessionConnectionParams, @@ -90,6 +90,7 @@ export class SessionConnection extends PersistentConnection { this._readerMap = params.readerMap this._writerMap = params.writerMap + this._crypto = params.crypto this._handleRawMessage = this._handleRawMessage.bind(this) } @@ -265,7 +266,7 @@ export class SessionConnection extends PersistentConnection { this._session.authorizationPending = true this.emit('auth-begin') - doAuthorization(this, this.params.crypto) + doAuthorization(this, this._crypto) .then(async ([authKey, serverSalt, timeOffset]) => { await this._session._authKey.setup(authKey) this._session.serverSalt = serverSalt @@ -312,7 +313,7 @@ export class SessionConnection extends PersistentConnection { this._isPfsBindingPending = true } - doAuthorization(this, this.params.crypto, TEMP_AUTH_KEY_EXPIRY) + doAuthorization(this, this._crypto, TEMP_AUTH_KEY_EXPIRY) .then(async ([tempAuthKey, tempServerSalt]) => { if (!this._usePfs) { this.log.info('pfs has been disabled while generating temp key') @@ -357,16 +358,11 @@ export class SessionConnection extends PersistentConnection { writer.raw(randomBytes(8)) const msgWithPadding = writer.result() - const hash = await this.params.crypto.sha1(msgWithoutPadding) + const hash = 
await this._crypto.sha1(msgWithoutPadding) const msgKey = hash.subarray(4, 20) - const ige = await createAesIgeForMessageOld( - this.params.crypto, - this._session._authKey.key, - msgKey, - true, - ) - const encryptedData = await ige.encrypt(msgWithPadding) + const ige = await createAesIgeForMessageOld(this._crypto, this._session._authKey.key, msgKey, true) + const encryptedData = ige.encrypt(msgWithPadding) const encryptedMessage = concatBuffers([this._session._authKey.id, msgKey, encryptedData]) const promise = createControllablePromise() @@ -512,22 +508,17 @@ export class SessionConnection extends PersistentConnection { } private _handleRawMessage(messageId: Long, seqNo: number, message: TlBinaryReader): void { - if (message.peekUint() === 0x3072cfa1) { - // gzip_packed - // we can't use message.gzip() because it may contain msg_container, - // so we parse it manually. - message.uint() + const objectId = message.uint() + if (objectId === GZIP_PACKED_ID) { return this._handleRawMessage( messageId, seqNo, - new TlBinaryReader(this._readerMap, gzipInflate(message.bytes())), + new TlBinaryReader(this._readerMap, this._crypto.gunzip(message.bytes())), ) } - if (message.peekUint() === 0x73f1f8dc) { - // msg_container - message.uint() + if (objectId === MSG_CONTAINER_ID) { const count = message.uint() for (let i = 0; i < count; i++) { @@ -545,15 +536,12 @@ export class SessionConnection extends PersistentConnection { return } - if (message.peekUint() === 0xf35c6d01) { - // rpc_result - message.uint() - + if (objectId === RPC_RESULT_ID) { return this._onRpcResult(messageId, message) } // we are safe.. i guess - this._handleMessage(messageId, message.object()) + this._handleMessage(messageId, message.object(objectId)) } private _handleMessage(messageId: Long, message_: unknown): void { @@ -729,7 +717,22 @@ export class SessionConnection extends PersistentConnection { const rpc = msg.rpc const customReader = this._readerMap._results![rpc.method] - const result: any = customReader ? 
customReader(message) : message.object() + + let result: any + + if (customReader) { + result = customReader(message) + } else { + const objectId = message.uint() + + if (objectId === GZIP_PACKED_ID) { + const inner = this._crypto.gunzip(message.bytes()) + // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment + result = TlBinaryReader.deserializeObject(this._readerMap, inner) + } else { + result = message.object(objectId) + } + } // initConnection call was definitely received and // processed by the server, so we no longer need to use it @@ -1262,13 +1265,14 @@ export class SessionConnection extends PersistentConnection { // if it is less than 0.9, then try to compress the whole request const middle = ~~((content.length - 1024) / 2) - const gzipped = gzipDeflate(content.subarray(middle, middle + 1024), 0.9) + const middlePart = content.subarray(middle, middle + 1024) + const gzipped = this._crypto.gzip(middlePart, Math.floor(middlePart.length * 0.9)) if (!gzipped) shouldGzip = false } if (shouldGzip) { - const gzipped = gzipDeflate(content, 0.9) + const gzipped = this._crypto.gzip(content, Math.floor(content.length * 0.9)) if (gzipped) { this.log.debug('gzipped %s (%db -> %db)', method, content.length, gzipped.length) @@ -1601,7 +1605,7 @@ export class SessionConnection extends PersistentConnection { // leave bytes for mtproto header (we'll write it later, // since we need seqno and msg_id to be larger than the content) writer.pos += 16 - writer.uint(0x73f1f8dc) // msg_container + writer.uint(MSG_CONTAINER_ID) writer.uint(messageCount) } diff --git a/packages/core/src/network/transports/obfuscated.ts b/packages/core/src/network/transports/obfuscated.ts index b9631125..b62a2f7a 100644 --- a/packages/core/src/network/transports/obfuscated.ts +++ b/packages/core/src/network/transports/obfuscated.ts @@ -1,5 +1,5 @@ import { concatBuffers, dataViewFromBuffer } from '../../utils/buffer-utils.js' -import { IEncryptionScheme, randomBytes } from '../../utils/index.js' +import { IAesCtr, randomBytes } from '../../utils/index.js' import { IPacketCodec } from './abstract.js' import { WrappedCodec } from './wrapped.js' @@ -11,8 +11,8 @@ export interface MtProxyInfo { } export class ObfuscatedPacketCodec extends WrappedCodec implements IPacketCodec { - private _encryptor?: IEncryptionScheme - private _decryptor?: IEncryptionScheme + private _encryptor?: IAesCtr + private _decryptor?: IAesCtr private _proxy?: MtProxyInfo @@ -78,31 +78,31 @@ export class ObfuscatedPacketCodec extends WrappedCodec implements IPacketCodec decryptKey = await this._crypto.sha256(concatBuffers([decryptKey, this._proxy.secret])) } - this._encryptor = await this._crypto.createAesCtr(encryptKey, encryptIv, true) - this._decryptor = await this._crypto.createAesCtr(decryptKey, decryptIv, false) + this._encryptor = this._crypto.createAesCtr(encryptKey, encryptIv, true) + this._decryptor = this._crypto.createAesCtr(decryptKey, decryptIv, false) - const encrypted = await this._encryptor.encrypt(random) + const encrypted = this._encryptor.process(random) random.set(encrypted.subarray(56, 64), 56) return random } async encode(packet: Uint8Array): Promise { - return this._encryptor!.encrypt(await this._inner.encode(packet)) + return this._encryptor!.process(await this._inner.encode(packet)) } feed(data: Uint8Array): void { - const dec = this._decryptor!.decrypt(data) + const dec = this._decryptor!.process(data) - if (ArrayBuffer.isView(dec)) this._inner.feed(dec) - else { - dec.then((dec) => 
this._inner.feed(dec)).catch((err) => this.emit('error', err)) - } + this._inner.feed(dec) } reset(): void { this._inner.reset() - delete this._encryptor - delete this._decryptor + this._encryptor?.close?.() + this._decryptor?.close?.() + + this._encryptor = undefined + this._decryptor = undefined } } diff --git a/packages/core/src/utils/crypto/abstract.ts b/packages/core/src/utils/crypto/abstract.ts index c3b0fde8..041c074a 100644 --- a/packages/core/src/utils/crypto/abstract.ts +++ b/packages/core/src/utils/crypto/abstract.ts @@ -1,11 +1,14 @@ import { MaybeAsync } from '../../types/index.js' -import { AesModeOfOperationIge } from './common.js' import { factorizePQSync } from './factorization.js' export interface IEncryptionScheme { - encrypt(data: Uint8Array): MaybeAsync + encrypt(data: Uint8Array): Uint8Array + decrypt(data: Uint8Array): Uint8Array +} - decrypt(data: Uint8Array): MaybeAsync +export interface IAesCtr { + process(data: Uint8Array): Uint8Array + close?(): void } export interface ICryptoProvider { @@ -25,32 +28,20 @@ export interface ICryptoProvider { hmacSha256(data: Uint8Array, key: Uint8Array): MaybeAsync - // in telegram, iv is always either used only once, or is the same for all calls for the key - createAesCtr(key: Uint8Array, iv: Uint8Array, encrypt: boolean): MaybeAsync + createAesCtr(key: Uint8Array, iv: Uint8Array, encrypt: boolean): IAesCtr - createAesIge(key: Uint8Array, iv: Uint8Array): MaybeAsync - - createAesEcb(key: Uint8Array): MaybeAsync + createAesIge(key: Uint8Array, iv: Uint8Array): IEncryptionScheme factorizePQ(pq: Uint8Array): MaybeAsync<[Uint8Array, Uint8Array]> + + gzip(data: Uint8Array, maxSize: number): Uint8Array | null + gunzip(data: Uint8Array): Uint8Array } export abstract class BaseCryptoProvider { - createAesIge(key: Uint8Array, iv: Uint8Array): MaybeAsync { - const ecb = this.createAesEcb(key) - - if ('then' in ecb) { - return ecb.then((ecb) => new AesModeOfOperationIge(key, iv, ecb)) - } - - return new AesModeOfOperationIge(key, iv, ecb) - } - factorizePQ(pq: Uint8Array) { return factorizePQSync(pq) } - - abstract createAesEcb(key: Uint8Array): MaybeAsync } export type CryptoProviderFactory = () => ICryptoProvider diff --git a/packages/core/src/utils/crypto/common.ts b/packages/core/src/utils/crypto/common.ts deleted file mode 100644 index b8dfb83f..00000000 --- a/packages/core/src/utils/crypto/common.ts +++ /dev/null @@ -1,67 +0,0 @@ -import type { IEncryptionScheme } from './abstract.js' -import { xorBufferInPlace } from './utils.js' - -/** - * AES mode of operation IGE implementation in JS - */ -export class AesModeOfOperationIge implements IEncryptionScheme { - private _key: Uint8Array - private _iv: Uint8Array - private _aes: IEncryptionScheme - - constructor(key: Uint8Array, iv: Uint8Array, ecb: IEncryptionScheme) { - this._key = key - this._iv = iv - this._aes = ecb - } - - async encrypt(data: Uint8Array): Promise { - if (data.length % 16 !== 0) { - throw new Error('invalid plaintext size (must be multiple of 16 bytes)') - } - - const ciphertext = new Uint8Array(data.length) - let block = new Uint8Array(16) - - let iv1 = this._iv.subarray(0, 16) - let iv2 = this._iv.subarray(16, 32) - - for (let i = 0; i < data.length; i += 16) { - block.set(data.subarray(i, i + 16)) - xorBufferInPlace(block, iv1) - block = await this._aes.encrypt(block) - xorBufferInPlace(block, iv2) - ciphertext.set(block, i) - - iv1 = ciphertext.subarray(i, i + 16) - iv2 = data.subarray(i, i + 16) - } - - return ciphertext - } - - async decrypt(data: 
Uint8Array): Promise { - if (data.length % 16 !== 0) { - throw new Error('invalid ciphertext size (must be multiple of 16 bytes)') - } - - const plaintext = new Uint8Array(data.length) - let block = new Uint8Array(16) - - let iv1 = this._iv.subarray(16, 32) - let iv2 = this._iv.subarray(0, 16) - - for (let i = 0; i < data.length; i += 16) { - block.set(data.subarray(i, i + 16)) - xorBufferInPlace(block, iv1) - block = await this._aes.decrypt(block) - xorBufferInPlace(block, iv2) - plaintext.set(block, i) - - iv1 = plaintext.subarray(i, i + 16) - iv2 = data.subarray(i, i + 16) - } - - return plaintext - } -} diff --git a/packages/core/src/utils/crypto/node-crypto.ts b/packages/core/src/utils/crypto/node-crypto.ts deleted file mode 100644 index df94acf5..00000000 --- a/packages/core/src/utils/crypto/node-crypto.ts +++ /dev/null @@ -1,64 +0,0 @@ -// eslint-disable-next-line no-restricted-imports -import { createCipheriv, createDecipheriv, createHash, createHmac, pbkdf2 } from 'crypto' - -import { MaybeAsync } from '../../types/index.js' -import { concatBuffers } from '../buffer-utils.js' -import { BaseCryptoProvider, ICryptoProvider, IEncryptionScheme } from './abstract.js' - -export class NodeCryptoProvider extends BaseCryptoProvider implements ICryptoProvider { - createAesCtr(key: Uint8Array, iv: Uint8Array, encrypt: boolean): IEncryptionScheme { - const cipher = (encrypt ? createCipheriv : createDecipheriv)(`aes-${key.length * 8}-ctr`, key, iv) - - const update = (data: Uint8Array) => cipher.update(data) - - return { - encrypt: update, - decrypt: update, - } - } - - createAesEcb(key: Uint8Array): IEncryptionScheme { - const methodName = `aes-${key.length * 8}-ecb` - - return { - encrypt(data: Uint8Array) { - const cipher = createCipheriv(methodName, key, null) - cipher.setAutoPadding(false) - - return concatBuffers([cipher.update(data), cipher.final()]) - }, - decrypt(data: Uint8Array) { - const cipher = createDecipheriv(methodName, key, null) - cipher.setAutoPadding(false) - - return concatBuffers([cipher.update(data), cipher.final()]) - }, - } - } - - pbkdf2( - password: Uint8Array, - salt: Uint8Array, - iterations: number, - keylen = 64, - algo = 'sha512', - ): MaybeAsync { - return new Promise((resolve, reject) => - pbkdf2(password, salt, iterations, keylen, algo, (err: Error | null, buf: Uint8Array) => - err !== null ? 
reject(err) : resolve(buf), - ), - ) - } - - sha1(data: Uint8Array): Uint8Array { - return createHash('sha1').update(data).digest() - } - - sha256(data: Uint8Array): Uint8Array { - return createHash('sha256').update(data).digest() - } - - hmacSha256(data: Uint8Array, key: Uint8Array): MaybeAsync { - return createHmac('sha256', key).update(data).digest() - } -} diff --git a/packages/core/src/utils/crypto/node.ts b/packages/core/src/utils/crypto/node.ts new file mode 100644 index 00000000..e5d11fd1 --- /dev/null +++ b/packages/core/src/utils/crypto/node.ts @@ -0,0 +1,93 @@ +// eslint-disable-next-line no-restricted-imports +import { createCipheriv, createHash, createHmac, pbkdf2 } from 'crypto' +import { deflateSync, gunzipSync } from 'zlib' + +import { ige256Decrypt, ige256Encrypt, initAsync, InitInput } from '@mtcute/wasm' + +import { MaybeAsync } from '../../types/index.js' +import { BaseCryptoProvider, IAesCtr, ICryptoProvider, IEncryptionScheme } from './abstract.js' + +export abstract class BaseNodeCryptoProvider extends BaseCryptoProvider { + createAesCtr(key: Uint8Array, iv: Uint8Array): IAesCtr { + const cipher = createCipheriv(`aes-${key.length * 8}-ctr`, key, iv) + + const update = (data: Uint8Array) => cipher.update(data) + + return { + process: update, + } + } + + pbkdf2( + password: Uint8Array, + salt: Uint8Array, + iterations: number, + keylen = 64, + algo = 'sha512', + ): MaybeAsync { + return new Promise((resolve, reject) => + pbkdf2(password, salt, iterations, keylen, algo, (err: Error | null, buf: Uint8Array) => + err !== null ? reject(err) : resolve(buf), + ), + ) + } + + sha1(data: Uint8Array): Uint8Array { + return createHash('sha1').update(data).digest() + } + + sha256(data: Uint8Array): Uint8Array { + return createHash('sha256').update(data).digest() + } + + hmacSha256(data: Uint8Array, key: Uint8Array): Uint8Array { + return createHmac('sha256', key).update(data).digest() + } + + gzip(data: Uint8Array, maxSize: number): Uint8Array | null { + // todo: test if wasm impl is better fit here + try { + // telegram accepts both zlib and gzip, but zlib is faster and has less overhead, so we use it here + return deflateSync(data, { + maxOutputLength: maxSize, + }) + // hot path, avoid additional runtime checks + // eslint-disable-next-line @typescript-eslint/no-explicit-any + } catch (e: any) { + if (e.code === 'ERR_BUFFER_TOO_LARGE') { + return null + } + + throw e + } + } + + gunzip(data: Uint8Array): Uint8Array { + // todo: test if wasm impl is better fit here + return gunzipSync(data) + } +} + +export class NodeCryptoProvider extends BaseNodeCryptoProvider implements ICryptoProvider { + private wasmInput?: InitInput + + constructor(params?: { wasmInput?: InitInput }) { + super() + this.wasmInput = params?.wasmInput + } + + initialize(): Promise { + return initAsync(this.wasmInput) + } + + createAesIge(key: Uint8Array, iv: Uint8Array): IEncryptionScheme { + return { + encrypt(data: Uint8Array): Uint8Array { + return ige256Encrypt(data, key, iv) + }, + decrypt(data: Uint8Array): Uint8Array { + return ige256Decrypt(data, key, iv) + }, + } + } +} diff --git a/packages/core/src/utils/crypto/subtle.ts b/packages/core/src/utils/crypto/web.ts similarity index 54% rename from packages/core/src/utils/crypto/subtle.ts rename to packages/core/src/utils/crypto/web.ts index 498f3553..89f1050f 100644 --- a/packages/core/src/utils/crypto/subtle.ts +++ b/packages/core/src/utils/crypto/web.ts @@ -1,12 +1,17 @@ +import { + createCtr256, + ctr256, + deflateMaxSize, + freeCtr256, + gunzip, + 
ige256Decrypt, + ige256Encrypt, + initAsync, + InitInput, +} from '@mtcute/wasm' + import { MaybeAsync } from '../../index.js' -import { BaseCryptoProvider, ICryptoProvider, IEncryptionScheme } from './abstract.js' - -import AES_, { CTR } from '@cryptography/aes' - -// fucking weird flex with es modules. -// i hate default imports please for the love of god never use them -type AES_ = typeof AES_.default -const AES = 'default' in AES_ ? AES_.default : AES_ as AES_ +import { BaseCryptoProvider, IAesCtr, ICryptoProvider, IEncryptionScheme } from './abstract.js' const ALGO_TO_SUBTLE: Record = { sha256: 'SHA-256', @@ -14,23 +19,23 @@ const ALGO_TO_SUBTLE: Record = { sha512: 'SHA-512', } -function wordsToBytes(words: Uint32Array): Uint8Array { - const o = new Uint8Array(words.byteLength) +export class WebCryptoProvider extends BaseCryptoProvider implements ICryptoProvider { + readonly subtle: SubtleCrypto + readonly wasmInput?: InitInput - const len = words.length * 4 + constructor(params?: { wasmInput?: InitInput; subtle?: SubtleCrypto }) { + super() + this.wasmInput = params?.wasmInput + const subtle = params?.subtle ?? globalThis.crypto?.subtle - for (let i = 0; i < len; ++i) { - o[i] = ((words[i >>> 2] >>> (24 - (i % 4) * 8)) & 0xff) + if (!subtle) { + throw new Error('SubtleCrypto is not available') + } + this.subtle = subtle } - return o -} - -export class SubtleCryptoProvider extends BaseCryptoProvider implements ICryptoProvider { - constructor( - readonly subtle: SubtleCrypto, - ) { - super() + initialize(): Promise { + return initAsync(this.wasmInput) } sha1(data: Uint8Array): MaybeAsync { @@ -78,21 +83,27 @@ export class SubtleCryptoProvider extends BaseCryptoProvider implements ICryptoP return new Uint8Array(res) } - createAesCtr(key: Uint8Array, iv: Uint8Array): IEncryptionScheme { - const aes = new CTR(key, iv) + createAesCtr(key: Uint8Array, iv: Uint8Array): IAesCtr { + const ctx = createCtr256(key, iv) return { - encrypt: (data) => wordsToBytes(aes.encrypt(data)), - decrypt: (data) => wordsToBytes(aes.decrypt(data)), + process: (data) => ctr256(ctx, data), + close: () => freeCtr256(ctx), } } - createAesEcb(key: Uint8Array): IEncryptionScheme { - const aes = new AES(key) - + createAesIge(key: Uint8Array, iv: Uint8Array): IEncryptionScheme { return { - encrypt: (data) => wordsToBytes(aes.encrypt(data)), - decrypt: (data) => wordsToBytes(aes.decrypt(data)), + encrypt: (data) => ige256Encrypt(data, key, iv), + decrypt: (data) => ige256Decrypt(data, key, iv), } } + + gzip(data: Uint8Array, maxSize: number): Uint8Array | null { + return deflateMaxSize(data, maxSize) + } + + gunzip(data: Uint8Array): Uint8Array { + return gunzip(data) + } } diff --git a/packages/core/src/utils/platform/crypto.ts b/packages/core/src/utils/platform/crypto.ts index e2b8c3d6..52abfb45 100644 --- a/packages/core/src/utils/platform/crypto.ts +++ b/packages/core/src/utils/platform/crypto.ts @@ -1,4 +1,4 @@ -import { NodeCryptoProvider } from '../crypto/node-crypto.js' +import { NodeCryptoProvider } from '../crypto/node.js' /** @internal */ export const _defaultCryptoProviderFactory = () => new NodeCryptoProvider() diff --git a/packages/core/src/utils/platform/crypto.web.ts b/packages/core/src/utils/platform/crypto.web.ts index 5e792bbf..2f27c213 100644 --- a/packages/core/src/utils/platform/crypto.web.ts +++ b/packages/core/src/utils/platform/crypto.web.ts @@ -1,5 +1,5 @@ import { MtUnsupportedError } from '../../index.js' -import { SubtleCryptoProvider } from '../crypto/subtle.js' +import { WebCryptoProvider 
} from '../crypto/web.js' /** @internal */ export const _defaultCryptoProviderFactory = () => { @@ -7,5 +7,5 @@ export const _defaultCryptoProviderFactory = () => { throw new MtUnsupportedError('WebCrypto API is not available') } - return new SubtleCryptoProvider(crypto.subtle) + return new WebCryptoProvider({ subtle: crypto.subtle }) } diff --git a/packages/core/tests/auth-key.spec.ts b/packages/core/tests/auth-key.spec.ts index 880ca391..eccc8753 100644 --- a/packages/core/tests/auth-key.spec.ts +++ b/packages/core/tests/auth-key.spec.ts @@ -6,7 +6,7 @@ import { describe, it } from 'mocha' import { TlReaderMap } from '@mtcute/tl-runtime' import { AuthKey } from '../src/network/auth-key.js' -import { NodeCryptoProvider } from '../src/utils/crypto/node-crypto.js' +import { NodeCryptoProvider } from '../src/utils/crypto/node.js' import { LogManager } from '../src/utils/index.js' chai.use(spies) diff --git a/packages/core/tests/crypto-providers.spec.ts b/packages/core/tests/crypto-providers.spec.ts index 2d0a410a..ea89492d 100644 --- a/packages/core/tests/crypto-providers.spec.ts +++ b/packages/core/tests/crypto-providers.spec.ts @@ -4,11 +4,13 @@ import { describe, it } from 'mocha' import { hexDecodeToBuffer, hexEncode, utf8EncodeToBuffer } from '@mtcute/tl-runtime' -import { NodeCryptoProvider } from '../src/utils/crypto/node-crypto.js' -import { SubtleCryptoProvider } from '../src/utils/crypto/subtle.js' +import { NodeCryptoProvider } from '../src/utils/crypto/node.js' +import { WebCryptoProvider } from '../src/utils/crypto/web.js' import { ICryptoProvider } from '../src/utils/index.js' export function testCryptoProvider(c: ICryptoProvider): void { + before(() => c.initialize?.()) + it('should calculate sha1', async () => { expect(hexEncode(await c.sha1(utf8EncodeToBuffer('')))).to.eq('da39a3ee5e6b4b0d3255bfef95601890afd80709') expect(hexEncode(await c.sha1(utf8EncodeToBuffer('hello')))).to.eq('aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d') @@ -47,81 +49,51 @@ export function testCryptoProvider(c: ICryptoProvider): void { ) }) - it('should encrypt and decrypt aes-ctr', async () => { - let aes = await c.createAesCtr( + it('should encrypt and decrypt aes-ctr', () => { + let aes = c.createAesCtr( hexDecodeToBuffer('d450aae0bf0060a4af1044886b42a13f7c506b35255d134a7e87ab3f23a9493b'), hexDecodeToBuffer('0182de2bd789c295c3c6c875c5e9e190'), true, ) const data = hexDecodeToBuffer('7baae571e4c2f4cfadb1931d5923aca7') - expect(hexEncode(await aes.encrypt(data))).eq('df5647dbb70bc393f2fb05b72f42286f') - expect(hexEncode(await aes.encrypt(data))).eq('3917147082672516b3177150129bc579') - expect(hexEncode(await aes.encrypt(data))).eq('2a7a9089270a5de45d5e3dd399cac725') - expect(hexEncode(await aes.encrypt(data))).eq('56d085217771398ac13583de4d677dd8') - expect(hexEncode(await aes.encrypt(data))).eq('cc639b488126cf36e79c4515e8012b92') - expect(hexEncode(await aes.encrypt(data))).eq('01384d100646cd562cc5586ec3f8f8c4') + expect(hexEncode(aes.process(data))).eq('df5647dbb70bc393f2fb05b72f42286f') + expect(hexEncode(aes.process(data))).eq('3917147082672516b3177150129bc579') + expect(hexEncode(aes.process(data))).eq('2a7a9089270a5de45d5e3dd399cac725') + expect(hexEncode(aes.process(data))).eq('56d085217771398ac13583de4d677dd8') + expect(hexEncode(aes.process(data))).eq('cc639b488126cf36e79c4515e8012b92') + expect(hexEncode(aes.process(data))).eq('01384d100646cd562cc5586ec3f8f8c4') - aes = await c.createAesCtr( + aes.close?.() + aes = c.createAesCtr( 
hexDecodeToBuffer('d450aae0bf0060a4af1044886b42a13f7c506b35255d134a7e87ab3f23a9493b'), hexDecodeToBuffer('0182de2bd789c295c3c6c875c5e9e190'), false, ) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('df5647dbb70bc393f2fb05b72f42286f')))).eq(hexEncode(data)) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('3917147082672516b3177150129bc579')))).eq(hexEncode(data)) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('2a7a9089270a5de45d5e3dd399cac725')))).eq(hexEncode(data)) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('56d085217771398ac13583de4d677dd8')))).eq(hexEncode(data)) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('cc639b488126cf36e79c4515e8012b92')))).eq(hexEncode(data)) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('01384d100646cd562cc5586ec3f8f8c4')))).eq(hexEncode(data)) + expect(hexEncode(aes.process(hexDecodeToBuffer('df5647dbb70bc393f2fb05b72f42286f')))).eq(hexEncode(data)) + expect(hexEncode(aes.process(hexDecodeToBuffer('3917147082672516b3177150129bc579')))).eq(hexEncode(data)) + expect(hexEncode(aes.process(hexDecodeToBuffer('2a7a9089270a5de45d5e3dd399cac725')))).eq(hexEncode(data)) + expect(hexEncode(aes.process(hexDecodeToBuffer('56d085217771398ac13583de4d677dd8')))).eq(hexEncode(data)) + expect(hexEncode(aes.process(hexDecodeToBuffer('cc639b488126cf36e79c4515e8012b92')))).eq(hexEncode(data)) + expect(hexEncode(aes.process(hexDecodeToBuffer('01384d100646cd562cc5586ec3f8f8c4')))).eq(hexEncode(data)) + + aes.close?.() }) - it('should encrypt and decrypt aes-ecb', async () => { - let aes = await c.createAesEcb( - hexDecodeToBuffer('d450aae0bf0060a4af1044886b42a13f7c506b35255d134a7e87ab3f23a9493b'), - ) - - expect(hexEncode(await aes.encrypt(hexDecodeToBuffer('f71eed6018f1ef976d39c19f9d29fd29')))).eq( - '038ef30acb438b64159f484aec541fd2', - ) - expect(hexEncode(await aes.encrypt(hexDecodeToBuffer('f71eed6018f1ef976d39c19f9d29fd29')))).eq( - '038ef30acb438b64159f484aec541fd2', - ) - expect(hexEncode(await aes.encrypt(hexDecodeToBuffer('460af382084b7960d2e9f3bca4cdc25b')))).eq( - '29c3af710c3c56f7fbb97ca06af3b974', - ) - - aes = await c.createAesEcb( - hexDecodeToBuffer('d450aae0bf0060a4af1044886b42a13f7c506b35255d134a7e87ab3f23a9493b'), - ) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('038ef30acb438b64159f484aec541fd2')))).eq( - 'f71eed6018f1ef976d39c19f9d29fd29', - ) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('038ef30acb438b64159f484aec541fd2')))).eq( - 'f71eed6018f1ef976d39c19f9d29fd29', - ) - expect(hexEncode(await aes.decrypt(hexDecodeToBuffer('29c3af710c3c56f7fbb97ca06af3b974')))).eq( - '460af382084b7960d2e9f3bca4cdc25b', - ) - }) - - it('should encrypt and decrypt aes-ige', async () => { - const aes = await c.createAesIge( + it('should encrypt and decrypt aes-ige', () => { + const aes = c.createAesIge( hexDecodeToBuffer('5468697320697320616E20696D706C655468697320697320616E20696D706C65'), hexDecodeToBuffer('6D656E746174696F6E206F6620494745206D6F646520666F72204F70656E5353'), ) expect( hexEncode( - await aes.encrypt( - hexDecodeToBuffer('99706487a1cde613bc6de0b6f24b1c7aa448c8b9c3403e3467a8cad89340f53b'), - ), + aes.encrypt(hexDecodeToBuffer('99706487a1cde613bc6de0b6f24b1c7aa448c8b9c3403e3467a8cad89340f53b')), ), ).to.eq('792ea8ae577b1a66cb3bd92679b8030ca54ee631976bd3a04547fdcb4639fa69') expect( hexEncode( - await aes.decrypt( - hexDecodeToBuffer('792ea8ae577b1a66cb3bd92679b8030ca54ee631976bd3a04547fdcb4639fa69'), - ), + 
aes.decrypt(hexDecodeToBuffer('792ea8ae577b1a66cb3bd92679b8030ca54ee631976bd3a04547fdcb4639fa69')), ), ).to.eq('99706487a1cde613bc6de0b6f24b1c7aa448c8b9c3403e3467a8cad89340f53b') }) @@ -137,12 +109,12 @@ describe('NodeCryptoProvider', () => { testCryptoProvider(new NodeCryptoProvider()) }) -describe('SubtleCryptoProvider', () => { +describe('WebCryptoProvider', () => { if (typeof crypto.subtle === 'undefined') { - console.warn('Skipping SubtleCryptoProvider tests') + console.warn('Skipping WebCryptoProvider tests') return } - testCryptoProvider(new SubtleCryptoProvider(crypto.subtle)) + testCryptoProvider(new WebCryptoProvider({ subtle: crypto.subtle })) }) diff --git a/packages/core/tests/keys.spec.ts b/packages/core/tests/keys.spec.ts index a85415cc..ae912513 100644 --- a/packages/core/tests/keys.spec.ts +++ b/packages/core/tests/keys.spec.ts @@ -1,7 +1,7 @@ import { expect } from 'chai' import { describe, it } from 'mocha' -import { NodeCryptoProvider } from '../src/utils/crypto/node-crypto.js' +import { NodeCryptoProvider } from '../src/utils/crypto/node.js' import { parsePublicKey } from '../src/utils/index.js' const crypto = new NodeCryptoProvider() diff --git a/packages/core/tests/mtproto-crypto.spec.ts b/packages/core/tests/mtproto-crypto.spec.ts index c21340f2..1ded2b61 100644 --- a/packages/core/tests/mtproto-crypto.spec.ts +++ b/packages/core/tests/mtproto-crypto.spec.ts @@ -9,7 +9,7 @@ import { createAesIgeForMessageOld, generateKeyAndIvFromNonce, } from '../src/utils/crypto/mtproto.js' -import { NodeCryptoProvider } from '../src/utils/crypto/node-crypto.js' +import { NodeCryptoProvider } from '../src/utils/crypto/node.js' chai.use(spies) diff --git a/packages/crypto-node/.gitignore b/packages/crypto-node/.gitignore index 1a53b9f0..a17106b9 100644 --- a/packages/crypto-node/.gitignore +++ b/packages/crypto-node/.gitignore @@ -1,2 +1,2 @@ .vs -build +build \ No newline at end of file diff --git a/packages/crypto-node/src/index.ts b/packages/crypto-node/src/index.ts index 21521222..53c39f41 100644 --- a/packages/crypto-node/src/index.ts +++ b/packages/crypto-node/src/index.ts @@ -1,4 +1,4 @@ -import { NodeCryptoProvider } from '@mtcute/core/src/utils/crypto/node-crypto.js' +import { BaseNodeCryptoProvider } from '@mtcute/core/src/utils/crypto/node.js' import { IEncryptionScheme } from '@mtcute/core/utils.js' import { native } from './native.cjs' @@ -13,7 +13,7 @@ const { ige256_decrypt, ige256_encrypt } = native * Other modes are supported natively by OpenSSL, and * they *are* faster than the custom ones. */ -export class NodeNativeCryptoProvider extends NodeCryptoProvider { +export class NodeNativeCryptoProvider extends BaseNodeCryptoProvider { createAesIge(key: Uint8Array, iv: Uint8Array): IEncryptionScheme { return { encrypt(data: Uint8Array): Uint8Array { diff --git a/packages/crypto/README.md b/packages/crypto/README.md deleted file mode 100644 index c2d2a6e9..00000000 --- a/packages/crypto/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# This is not a package -This is just a bunch of C files that are imported in `crypto-*` packages. 
- -## Acknowledgements -This code is based on [pyrogram/tgcrypto](https://github.com/pyrogram/tgcrypto) \ No newline at end of file diff --git a/packages/crypto/cbc256.c b/packages/crypto/cbc256.c deleted file mode 100644 index 0077c73d..00000000 --- a/packages/crypto/cbc256.c +++ /dev/null @@ -1,36 +0,0 @@ -#include "aes256.h" - -void cbc256_encrypt(uint8_t* in, size_t length, uint8_t* key, uint8_t* iv, uint8_t* out) { - uint32_t expandedKey[EXPANDED_KEY_SIZE]; - uint32_t i, j; - - uint8_t* currentIv = iv; - - aes256_set_encryption_key(key, expandedKey); - - for (i = 0; i < length; i += AES_BLOCK_SIZE) { - for (j = 0; j < AES_BLOCK_SIZE; ++j) - out[i + j] = in[i + j] ^ currentIv[j]; - - aes256_encrypt(&out[i], &out[i], expandedKey); - currentIv = &out[i]; - } -} - -void cbc256_decrypt(uint8_t* in, size_t length, uint8_t* key, uint8_t* iv, uint8_t* out) { - uint32_t expandedKey[EXPANDED_KEY_SIZE]; - uint32_t i, j; - - uint8_t* currentIv = iv; - - aes256_set_decryption_key(key, expandedKey); - - for (i = 0; i < length; i += AES_BLOCK_SIZE) { - aes256_decrypt(&in[i], &out[i], expandedKey); - - for (j = 0; j < AES_BLOCK_SIZE; ++j) - out[i + j] ^= currentIv[j]; - - currentIv = &in[i]; - } -} \ No newline at end of file diff --git a/packages/crypto/cbc256.h b/packages/crypto/cbc256.h deleted file mode 100644 index b49179fa..00000000 --- a/packages/crypto/cbc256.h +++ /dev/null @@ -1,17 +0,0 @@ -#include - -#ifndef CBC256_H -#define CBC256_H - -#ifdef __cplusplus -extern "C" { -#endif - -void cbc256_encrypt(uint8_t* in, size_t length, uint8_t* key, uint8_t* iv, uint8_t* out); -void cbc256_decrypt(uint8_t* in, size_t length, uint8_t* key, uint8_t* iv, uint8_t* out); - -#ifdef __cplusplus -} -#endif - -#endif // CBC256_H diff --git a/packages/crypto/ctr256.c b/packages/crypto/ctr256.c deleted file mode 100644 index bca6a194..00000000 --- a/packages/crypto/ctr256.c +++ /dev/null @@ -1,32 +0,0 @@ -#include "aes256.h" - -#define MIN(a, b) (((a) < (b)) ? 
(a) : (b)) - -void ctr256(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uint8_t* counter, uint8_t* out) { - uint8_t chunk[AES_BLOCK_SIZE]; - uint32_t expandedKey[EXPANDED_KEY_SIZE]; - uint32_t i, j, k; - - memcpy(out, in, length); - aes256_set_encryption_key(key, expandedKey); - - aes256_encrypt(iv, chunk, expandedKey); - - for (i = 0; i < length; i += AES_BLOCK_SIZE) { - for (j = 0; j < MIN(length - i, AES_BLOCK_SIZE); ++j) { - out[i + j] ^= chunk[(*counter)++]; - - if (*counter >= AES_BLOCK_SIZE) - *counter = 0; - - if (*counter == 0) { - k = AES_BLOCK_SIZE; - while(k--) - if (++iv[k]) - break; - - aes256_encrypt(iv, chunk, expandedKey); - } - } - } -} diff --git a/packages/crypto/ctr256.h b/packages/crypto/ctr256.h deleted file mode 100644 index c74065f7..00000000 --- a/packages/crypto/ctr256.h +++ /dev/null @@ -1,8 +0,0 @@ -#include - -#ifndef CTR256_H -#define CTR256_H - -extern "C" uint8_t* ctr256(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uint8_t* state, uint8_t* out); - -#endif // CTR256_H diff --git a/packages/tl-runtime/package.json b/packages/tl-runtime/package.json index f12a6802..b1f09465 100644 --- a/packages/tl-runtime/package.json +++ b/packages/tl-runtime/package.json @@ -17,9 +17,7 @@ "./cjs/encodings/hex.js": "./cjs/encodings/hex.web.js", "./esm/encodings/hex.js": "./esm/encodings/hex.web.js", "./cjs/encodings/utf8.js": "./cjs/encodings/utf8.web.js", - "./esm/encodings/utf8.js": "./esm/encodings/utf8.web.js", - "./cjs/encodings/gzip.js": "./cjs/encodings/gzip.web.js", - "./esm/encodings/gzip.js": "./esm/encodings/gzip.web.js" + "./esm/encodings/utf8.js": "./esm/encodings/utf8.web.js" }, "distOnlyFields": { "exports": { @@ -31,10 +29,6 @@ }, "main": "src/index.ts", "dependencies": { - "long": "5.2.3", - "pako": "2.1.0" - }, - "devDependencies": { - "@types/pako": "2.0.0" + "long": "5.2.3" } } diff --git a/packages/tl-runtime/src/encodings/gzip.ts b/packages/tl-runtime/src/encodings/gzip.ts deleted file mode 100644 index f865052b..00000000 --- a/packages/tl-runtime/src/encodings/gzip.ts +++ /dev/null @@ -1,39 +0,0 @@ -/* eslint-disable no-restricted-globals */ - -import { deflateSync, gunzipSync } from 'node:zlib' - -/** - * Decompress a buffer with gzip. - * @param buf Buffer to decompress - */ -export function gzipInflate(buf: Uint8Array): Uint8Array { - return gunzipSync(buf) -} - -/** - * Compress a buffer with gzip. - * - * @param buf Buffer to compress - * @param maxRatio - * Maximum compression ratio. If the resulting buffer is smaller than - * `buf.length * ratio`, `null` is returned. 
- */ -export function gzipDeflate(buf: ArrayBuffer, maxRatio?: number): Buffer | null { - if (maxRatio) { - try { - return deflateSync(buf, { - maxOutputLength: Math.floor(buf.byteLength * maxRatio), - }) - // hot path, avoid additional runtime checks - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } catch (e: any) { - if (e.code === 'ERR_BUFFER_TOO_LARGE') { - return null - } - - throw e - } - } - - return deflateSync(buf) -} diff --git a/packages/tl-runtime/src/encodings/gzip.web.ts b/packages/tl-runtime/src/encodings/gzip.web.ts deleted file mode 100644 index 797f0d0f..00000000 --- a/packages/tl-runtime/src/encodings/gzip.web.ts +++ /dev/null @@ -1,40 +0,0 @@ -import { Data, Deflate, inflate } from 'pako' - -export function gzipInflate(buf: Uint8Array): Uint8Array { - return inflate(buf) -} - -const ERROR_SIZE_LIMIT_REACHED = 'ERR_SIZE_LIMIT_REACHED' - -class DeflateLimited extends Deflate { - constructor(readonly limit: number) { - super() - } - - _size = 0 - - onData(chunk: Data) { - this._size += (chunk as Uint8Array).length - - if (this._size > this.limit) { - // caught locally - // eslint-disable-next-line @typescript-eslint/no-throw-literal - throw ERROR_SIZE_LIMIT_REACHED - } - - super.onData(chunk) - } -} - -export function gzipDeflate(buf: Uint8Array, maxRatio?: number): Uint8Array | null { - const deflator = maxRatio ? new DeflateLimited(Math.floor(buf.length * maxRatio)) : new Deflate() - - try { - deflator.push(buf, true) - } catch (e) { - if (e === ERROR_SIZE_LIMIT_REACHED) return null - throw e - } - - return deflator.result -} diff --git a/packages/tl-runtime/src/encodings/index.ts b/packages/tl-runtime/src/encodings/index.ts index 3c14a235..ce22da50 100644 --- a/packages/tl-runtime/src/encodings/index.ts +++ b/packages/tl-runtime/src/encodings/index.ts @@ -1,4 +1,3 @@ export * from './base64.js' -export * from './gzip.js' export * from './hex.js' export * from './utf8.js' diff --git a/packages/tl-runtime/src/index.ts b/packages/tl-runtime/src/index.ts index b112fec0..def86778 100644 --- a/packages/tl-runtime/src/index.ts +++ b/packages/tl-runtime/src/index.ts @@ -1,5 +1,4 @@ export * from './encodings/base64.js' -export * from './encodings/gzip.js' export * from './encodings/hex.js' export * from './encodings/utf8.js' export * from './reader.js' diff --git a/packages/tl-runtime/src/reader.ts b/packages/tl-runtime/src/reader.ts index a2f2ca0e..faa247c8 100644 --- a/packages/tl-runtime/src/reader.ts +++ b/packages/tl-runtime/src/reader.ts @@ -1,6 +1,5 @@ import Long from 'long' -import { gzipInflate } from './encodings/gzip.js' import { hexEncode } from './encodings/hex.js' import { utf8Decode } from './encodings/utf8.js' @@ -178,13 +177,10 @@ export class TlBinaryReader { return utf8Decode(this.bytes()) } - object(): unknown { - const id = this.uint() - + object(id = this.uint()): unknown { if (id === 0x1cb5c415 /* vector */) { return this.vector(this.object, true) } - if (id === 0x3072cfa1 /* gzip_packed */) return this.gzip() if (id === 0xbc799737 /* boolFalse */) return false if (id === 0x997275b5 /* boolTrue */) return true // unsure if it is actually used in the wire, seems like it's only used for boolean flags @@ -209,10 +205,6 @@ export class TlBinaryReader { return reader(this) } - gzip(): unknown { - return new TlBinaryReader(this.objectsMap, gzipInflate(this.bytes())).object() - } - vector(reader = this.object, bare = false): unknown[] { if (!bare) { const uint = this.uint() diff --git a/packages/tl/scripts/gen-rsa-keys.ts 
b/packages/tl/scripts/gen-rsa-keys.ts
index 05dbab3b..e5b6cd85 100644
--- a/packages/tl/scripts/gen-rsa-keys.ts
+++ b/packages/tl/scripts/gen-rsa-keys.ts
@@ -3,7 +3,7 @@ import { writeFile } from 'fs/promises'
 import { join } from 'path'
 import readline from 'readline'
 
-import { NodeCryptoProvider } from '@mtcute/core/src/utils/crypto/node-crypto.js'
+import { NodeCryptoProvider } from '@mtcute/core/src/utils/crypto/node.js'
 import { parsePublicKey } from '@mtcute/core/utils.js'
 
 import { TlPublicKey } from '../binary/rsa-keys.js'
diff --git a/packages/wasm/.gitignore
new file mode 100644
index 00000000..c795b054
--- /dev/null
+++ b/packages/wasm/.gitignore
@@ -0,0 +1 @@
+build
\ No newline at end of file
diff --git a/packages/wasm/README.md
new file mode 100644
index 00000000..ea528ffb
--- /dev/null
+++ b/packages/wasm/README.md
@@ -0,0 +1,19 @@
+# @mtcute/wasm
+
+📖 [API Reference](https://ref.mtcute.dev/modules/_mtcute_wasm.html)
+
+Highly optimized for size & speed WASM implementation of common algorithms used in Telegram.
+
+## Features
+- **Super lightweight**: Only 45 KB raw, 22 KB gzipped
+- **Blazingly fast**: Up to 10x faster than pure JS implementations
+- Implements AES IGE and Deflate (zlib compression + gunzip), which are not available in some environments (e.g. web)
+
+## Acknowledgements
+- Deflate is implemented through a modified version of [libdeflate](https://github.com/ebiggers/libdeflate), MIT license.
+  - Modified by [kamillaova](https://github.com/kamillaova) to support WASM and improve bundle size
+- AES IGE code is mostly based on [tgcrypto](https://github.com/pyrogram/tgcrypto), LGPL-3.0 license.
+  - To comply with LGPL-3.0, the source code of the modified tgcrypto is available [here](./lib/crypto/) under LGPL-3.0 license.
+
+## Benchmarks
+See https://github.com/mtcute/benchmarks
\ No newline at end of file
diff --git a/packages/wasm/build.config.cjs
new file mode 100644
index 00000000..a8c383ce
--- /dev/null
+++ b/packages/wasm/build.config.cjs
@@ -0,0 +1,20 @@
+// /* eslint-disable no-console */
+// import * cp from 'child_process'
+// import * as fs from 'fs'
+// import { join } from 'path'
+
+// const root = new URL('.', import.meta.url).pathname
+
+module.exports = ({ path: { join }, fs, outDir, packageDir, transformFile }) => ({
+    esmOnlyDirectives: true,
+    final() {
+        const fixWasmPath = (path) => {
+            transformFile(join(outDir, path), (data) => data.replace('../lib/mtcute.wasm', '../mtcute.wasm'))
+        }
+
+        fixWasmPath('cjs/init.js')
+        fixWasmPath('esm/init.js')
+
+        fs.cpSync(join(packageDir, 'lib/mtcute.wasm'), join(outDir, 'mtcute.wasm'))
+    },
+})
diff --git a/packages/wasm/lib/Dockerfile
new file mode 100644
index 00000000..6ccbe96e
--- /dev/null
+++ b/packages/wasm/lib/Dockerfile
@@ -0,0 +1,14 @@
+FROM alpine:3.18.4 AS build
+
+WORKDIR /src
+
+RUN apk add --no-cache lld make clang16 binaryen
+
+COPY crypto /src/crypto
+COPY libdeflate /src/libdeflate
+COPY *.h *.c Makefile /src/
+
+RUN ZLIB_COMPRESSION_API=1 GZIP_DECOMPRESSION_API=1 IGE_API=1 CTR_API=1 make
+
+FROM scratch AS binaries
+COPY --from=build /src/mtcute.wasm /
diff --git a/packages/wasm/lib/Makefile
new file mode 100644
index 00000000..1a403a01
--- /dev/null
+++ b/packages/wasm/lib/Makefile
@@ -0,0 +1,75 @@
+.PHONY: all clean
+
+DEFAULT_API ?= 0
+
+DEFLATE_COMPRESSION_API ?= $(DEFAULT_API)
+DEFLATE_DECOMPRESSION_API ?= $(DEFAULT_API)
+GZIP_COMPRESSION_API ?= $(DEFAULT_API)
+GZIP_DECOMPRESSION_API ?= $(DEFAULT_API)
+ZLIB_COMPRESSION_API ?= $(DEFAULT_API)
+ZLIB_DECOMPRESSION_API ?= $(DEFAULT_API)
+CRC32_API ?= $(DEFAULT_API)
+ADLER32_API ?= $(DEFAULT_API)
+IGE_API ?= $(DEFAULT_API)
+CTR_API ?= $(DEFAULT_API)
+
+CRC32 ?= 0
+
+LOGGING ?= 0
+
+_DEFLATE_COMPRESSION := 1
+_DEFLATE_DECOMPRESSION := 1
+_ADLER32 := $(findstring 1, $(ZLIB_COMPRESSION_API)$(ZLIB_DECOMPRESSION_API))
+_AES := $(findstring 1, $(IGE_API)$(CTR_API))
+
+SOURCES = utils.c \
+	$(if $(filter 1, $(_DEFLATE_COMPRESSION)), libdeflate/deflate_compress.c) \
+	$(if $(filter 1, $(_DEFLATE_DECOMPRESSION)), libdeflate/deflate_decompress.c) \
+	$(if $(filter 1, $(GZIP_COMPRESSION_API)), libdeflate/gzip_compress.c) \
+	$(if $(filter 1, $(GZIP_DECOMPRESSION_API)), libdeflate/gzip_decompress.c) \
+	$(if $(filter 1, $(ZLIB_COMPRESSION_API)), libdeflate/zlib_compress.c) \
+	$(if $(filter 1, $(ZLIB_DECOMPRESSION_API)), libdeflate/zlib_decompress.c) \
+	$(if $(filter 1, $(CRC32)), libdeflate/crc32.c) \
+	$(if $(filter 1, $(_ADLER32)), libdeflate/adler32.c) \
+	$(if $(filter 1, $(_AES)), crypto/aes256.c) \
+	$(if $(filter 1, $(IGE_API)), crypto/ige256.c) \
+	$(if $(filter 1, $(CTR_API)), crypto/ctr256.c)
+
+CC := clang
+CFLAGS_WASM := \
+	-target wasm32-unknown-unknown \
+	-nostdlib -ffreestanding -DFREESTANDING \
+	$(if $(filter 1, $(LOGGING)), -DLOGGING) \
+	-mbulk-memory \
+	-Wl,--no-entry,--export-dynamic,--lto-O3
+
+CFLAGS := $(CFLAGS_WASM) \
+	-O3 \
+	-Qn \
+	-DNDEBUG \
+	-mno-exception-handling \
+	-fdelete-null-pointer-checks \
+	-fno-stack-protector \
+	-flto=full \
+	-fdata-sections \
+	-ffunction-sections \
+	-Wl,--gc-sections \
+	-fno-inline \
+	-fno-unroll-loops
+
+ifneq ($(OS),Windows_NT)
+	UNAME_S := $(shell uname -s)
+	ifeq ($(UNAME_S),Darwin)
+		export PATH := 
/opt/homebrew/opt/llvm/bin/:$(PATH) + endif +endif + +OUT := mtcute.wasm + +$(OUT): $(SOURCES) + $(CC) $(CFLAGS) -I . -o $@ $^ + +clean: + rm -f $(OUT) + +all: $(OUT) diff --git a/packages/wasm/lib/common_defs.h b/packages/wasm/lib/common_defs.h new file mode 100644 index 00000000..ce3eaf74 --- /dev/null +++ b/packages/wasm/lib/common_defs.h @@ -0,0 +1,686 @@ +/* + * common_defs.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef COMMON_DEFS_H +#define COMMON_DEFS_H + +#include "libdeflate.h" + +#include +#include /* for size_t */ +#include +#ifdef _MSC_VER +# include /* for _BitScan*() and other intrinsics */ +# include /* for _byteswap_*() */ + /* Disable MSVC warnings that are expected. */ + /* /W2 */ +# pragma warning(disable : 4146) /* unary minus on unsigned type */ + /* /W3 */ +# pragma warning(disable : 4018) /* signed/unsigned mismatch */ +# pragma warning(disable : 4244) /* possible loss of data */ +# pragma warning(disable : 4267) /* possible loss of precision */ +# pragma warning(disable : 4310) /* cast truncates constant value */ + /* /W4 */ +# pragma warning(disable : 4100) /* unreferenced formal parameter */ +# pragma warning(disable : 4127) /* conditional expression is constant */ +# pragma warning(disable : 4189) /* local variable initialized but not referenced */ +# pragma warning(disable : 4232) /* nonstandard extension used */ +# pragma warning(disable : 4245) /* conversion from 'int' to 'unsigned int' */ +# pragma warning(disable : 4295) /* array too small to include terminating null */ +#endif + +/* ========================================================================== */ +/* Target architecture */ +/* ========================================================================== */ + +/* If possible, define a compiler-independent ARCH_* macro. 
*/ +#undef ARCH_X86_64 +#undef ARCH_X86_32 +#undef ARCH_ARM64 +#undef ARCH_ARM32 +#ifdef _MSC_VER +# if defined(_M_X64) +# define ARCH_X86_64 +# elif defined(_M_IX86) +# define ARCH_X86_32 +# elif defined(_M_ARM64) +# define ARCH_ARM64 +# elif defined(_M_ARM) +# define ARCH_ARM32 +# endif +#else +# if defined(__x86_64__) +# define ARCH_X86_64 +# elif defined(__i386__) +# define ARCH_X86_32 +# elif defined(__aarch64__) +# define ARCH_ARM64 +# elif defined(__arm__) +# define ARCH_ARM32 +# endif +#endif + +/* ========================================================================== */ +/* Type definitions */ +/* ========================================================================== */ + +/* Fixed-width integer types */ +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +/* ssize_t, if not available in */ +#ifdef _MSC_VER +# ifdef _WIN64 + typedef long long ssize_t; +# else + typedef long ssize_t; +# endif +#endif + +/* + * Word type of the target architecture. Use 'size_t' instead of + * 'unsigned long' to account for platforms such as Windows that use 32-bit + * 'unsigned long' on 64-bit architectures. + */ +typedef size_t machine_word_t; + +/* Number of bytes in a word */ +#define WORDBYTES ((int)sizeof(machine_word_t)) + +/* Number of bits in a word */ +#define WORDBITS (8 * WORDBYTES) + +/* ========================================================================== */ +/* Optional compiler features */ +/* ========================================================================== */ + +/* Compiler version checks. Only use when absolutely necessary. */ +#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) +# define GCC_PREREQ(major, minor) \ + (__GNUC__ > (major) || \ + (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) +#else +# define GCC_PREREQ(major, minor) 0 +#endif +#ifdef __clang__ +# ifdef __apple_build_version__ +# define CLANG_PREREQ(major, minor, apple_version) \ + (__apple_build_version__ >= (apple_version)) +# else +# define CLANG_PREREQ(major, minor, apple_version) \ + (__clang_major__ > (major) || \ + (__clang_major__ == (major) && __clang_minor__ >= (minor))) +# endif +#else +# define CLANG_PREREQ(major, minor, apple_version) 0 +#endif + +/* + * Macros to check for compiler support for attributes and builtins. clang + * implements these macros, but gcc doesn't, so generally any use of one of + * these macros must also be combined with a gcc version check. + */ +#ifndef __has_attribute +# define __has_attribute(attribute) 0 +#endif +#ifndef __has_builtin +# define __has_builtin(builtin) 0 +#endif + +/* + * restrict - hint that writes only occur through the given pointer. + * + * Don't use MSVC's __restrict, since it has nonstandard behavior. + * Standard restrict is okay, if it is supported. 
+ */ +#if !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L) +# if defined(__GNUC__) || defined(__clang__) +# define restrict __restrict__ +# else +# define restrict +# endif +#endif /* else assume 'restrict' is usable as-is */ + +/* likely(expr) - hint that an expression is usually true */ +#if defined(__GNUC__) || __has_builtin(__builtin_expect) +# define likely(expr) __builtin_expect(!!(expr), 1) +#else +# define likely(expr) (expr) +#endif + +/* unlikely(expr) - hint that an expression is usually false */ +#if defined(__GNUC__) || __has_builtin(__builtin_expect) +# define unlikely(expr) __builtin_expect(!!(expr), 0) +#else +# define unlikely(expr) (expr) +#endif + +/* prefetchr(addr) - prefetch into L1 cache for read */ +#undef prefetchr +#if defined(__GNUC__) || __has_builtin(__builtin_prefetch) +# define prefetchr(addr) __builtin_prefetch((addr), 0) +#elif defined(_MSC_VER) +# if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# define prefetchr(addr) _mm_prefetch((addr), _MM_HINT_T0) +# elif defined(ARCH_ARM64) +# define prefetchr(addr) __prefetch2((addr), 0x00 /* prfop=PLDL1KEEP */) +# elif defined(ARCH_ARM32) +# define prefetchr(addr) __prefetch(addr) +# endif +#endif +#ifndef prefetchr +# define prefetchr(addr) +#endif + +/* prefetchw(addr) - prefetch into L1 cache for write */ +#undef prefetchw +#if defined(__GNUC__) || __has_builtin(__builtin_prefetch) +# define prefetchw(addr) __builtin_prefetch((addr), 1) +#elif defined(_MSC_VER) +# if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# define prefetchw(addr) _m_prefetchw(addr) +# elif defined(ARCH_ARM64) +# define prefetchw(addr) __prefetch2((addr), 0x10 /* prfop=PSTL1KEEP */) +# elif defined(ARCH_ARM32) +# define prefetchw(addr) __prefetchw(addr) +# endif +#endif +#ifndef prefetchw +# define prefetchw(addr) +#endif + +/* + * _aligned_attribute(n) - declare that the annotated variable, or variables of + * the annotated type, must be aligned on n-byte boundaries. + */ +#undef _aligned_attribute +#if defined(__GNUC__) || __has_attribute(aligned) +# define _aligned_attribute(n) __attribute__((aligned(n))) +#elif defined(_MSC_VER) +# define _aligned_attribute(n) __declspec(align(n)) +#endif + +/* + * _target_attribute(attrs) - override the compilation target for a function. + * + * This accepts one or more comma-separated suffixes to the -m prefix jointly + * forming the name of a machine-dependent option. On gcc-like compilers, this + * enables codegen for the given targets, including arbitrary compiler-generated + * code as well as the corresponding intrinsics. On other compilers this macro + * expands to nothing, though MSVC allows intrinsics to be used anywhere anyway. + */ +#if GCC_PREREQ(4, 4) || __has_attribute(target) +# define _target_attribute(attrs) __attribute__((target(attrs))) +# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 1 +#else +# define _target_attribute(attrs) +# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0 +#endif + +/* ========================================================================== */ +/* Miscellaneous macros */ +/* ========================================================================== */ + +#define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0])) +#define MIN(a, b) ((a) <= (b) ? (a) : (b)) +#define MAX(a, b) ((a) >= (b) ? 
(a) : (b)) +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)])) +#define ALIGN(n, a) (((n) + (a) - 1) & ~((a) - 1)) +#define ROUND_UP(n, d) ((d) * DIV_ROUND_UP((n), (d))) + +/* ========================================================================== */ +/* Endianness handling */ +/* ========================================================================== */ + +/* + * CPU_IS_LITTLE_ENDIAN() - 1 if the CPU is little endian, or 0 if it is big + * endian. When possible this is a compile-time macro that can be used in + * preprocessor conditionals. As a fallback, a generic method is used that + * can't be used in preprocessor conditionals but should still be optimized out. + */ +#if defined(__BYTE_ORDER__) /* gcc v4.6+ and clang */ +# define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#elif defined(_MSC_VER) +# define CPU_IS_LITTLE_ENDIAN() true +#else +static bool CPU_IS_LITTLE_ENDIAN(void) +{ + union { + u32 w; + u8 b; + } u; + + u.w = 1; + return u.b; +} +#endif + +/* bswap16(v) - swap the bytes of a 16-bit integer */ +static u16 bswap16(u16 v) +{ +#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) + return __builtin_bswap16(v); +#elif defined(_MSC_VER) + return _byteswap_ushort(v); +#else + return (v << 8) | (v >> 8); +#endif +} + +/* bswap32(v) - swap the bytes of a 32-bit integer */ +static u32 bswap32(u32 v) +{ +#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) + return _byteswap_ulong(v); +#else + return ((v & 0x000000FF) << 24) | + ((v & 0x0000FF00) << 8) | + ((v & 0x00FF0000) >> 8) | + ((v & 0xFF000000) >> 24); +#endif +} + +/* bswap64(v) - swap the bytes of a 64-bit integer */ +static u64 bswap64(u64 v) +{ +#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) + return _byteswap_uint64(v); +#else + return ((v & 0x00000000000000FF) << 56) | + ((v & 0x000000000000FF00) << 40) | + ((v & 0x0000000000FF0000) << 24) | + ((v & 0x00000000FF000000) << 8) | + ((v & 0x000000FF00000000) >> 8) | + ((v & 0x0000FF0000000000) >> 24) | + ((v & 0x00FF000000000000) >> 40) | + ((v & 0xFF00000000000000) >> 56); +#endif +} + +#define le16_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap16(v)) +#define le32_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap32(v)) +#define le64_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap64(v)) +#define be16_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap16(v) : (v)) +#define be32_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap32(v) : (v)) +#define be64_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap64(v) : (v)) + +/* ========================================================================== */ +/* Unaligned memory accesses */ +/* ========================================================================== */ + +/* + * UNALIGNED_ACCESS_IS_FAST() - 1 if unaligned memory accesses can be performed + * efficiently on the target platform, otherwise 0. + */ +#if (defined(__GNUC__) || defined(__clang__)) && \ + (defined(ARCH_X86_64) || defined(ARCH_X86_32) || \ + defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \ + /* + * For all compilation purposes, WebAssembly behaves like any other CPU + * instruction set. 
Even though WebAssembly engine might be running on + * top of different actual CPU architectures, the WebAssembly spec + * itself permits unaligned access and it will be fast on most of those + * platforms, and simulated at the engine level on others, so it's + * worth treating it as a CPU architecture with fast unaligned access. + */ defined(__wasm__)) +# define UNALIGNED_ACCESS_IS_FAST 1 +#elif defined(_MSC_VER) +# define UNALIGNED_ACCESS_IS_FAST 1 +#else +# define UNALIGNED_ACCESS_IS_FAST 0 +#endif + +/* + * Implementing unaligned memory accesses using memcpy() is portable, and it + * usually gets optimized appropriately by modern compilers. I.e., each + * memcpy() of 1, 2, 4, or WORDBYTES bytes gets compiled to a load or store + * instruction, not to an actual function call. + * + * We no longer use the "packed struct" approach to unaligned accesses, as that + * is nonstandard, has unclear semantics, and doesn't receive enough testing + * (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994). + * + * arm32 with __ARM_FEATURE_UNALIGNED in gcc 5 and earlier is a known exception + * where memcpy() generates inefficient code + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67366). However, we no longer + * consider that one case important enough to maintain different code for. + * If you run into it, please just use a newer version of gcc (or use clang). + */ + +/* Unaligned loads and stores without endianness conversion */ + +#define DEFINE_UNALIGNED_TYPE(type) \ +static type \ +load_##type##_unaligned(const void *p) \ +{ \ + type v; \ + \ + __builtin_memcpy(&v, p, sizeof(v)); \ + return v; \ +} \ + \ +static void \ +store_##type##_unaligned(type v, void *p) \ +{ \ + __builtin_memcpy(p, &v, sizeof(v)); \ +} + +DEFINE_UNALIGNED_TYPE(u16) +DEFINE_UNALIGNED_TYPE(u32) +DEFINE_UNALIGNED_TYPE(u64) +DEFINE_UNALIGNED_TYPE(machine_word_t) + +#define load_word_unaligned load_machine_word_t_unaligned +#define store_word_unaligned store_machine_word_t_unaligned + +/* Unaligned loads with endianness conversion */ + +static u16 +get_unaligned_le16(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le16_bswap(load_u16_unaligned(p)); + else + return ((u16)p[1] << 8) | p[0]; +} + +static u16 +get_unaligned_be16(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return be16_bswap(load_u16_unaligned(p)); + else + return ((u16)p[0] << 8) | p[1]; +} + +static u32 +get_unaligned_le32(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le32_bswap(load_u32_unaligned(p)); + else + return ((u32)p[3] << 24) | ((u32)p[2] << 16) | + ((u32)p[1] << 8) | p[0]; +} + +static u32 +get_unaligned_be32(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return be32_bswap(load_u32_unaligned(p)); + else + return ((u32)p[0] << 24) | ((u32)p[1] << 16) | + ((u32)p[2] << 8) | p[3]; +} + +static u64 +get_unaligned_le64(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le64_bswap(load_u64_unaligned(p)); + else + return ((u64)p[7] << 56) | ((u64)p[6] << 48) | + ((u64)p[5] << 40) | ((u64)p[4] << 32) | + ((u64)p[3] << 24) | ((u64)p[2] << 16) | + ((u64)p[1] << 8) | p[0]; +} + +static machine_word_t +get_unaligned_leword(const u8 *p) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return get_unaligned_le32(p); + else + return get_unaligned_le64(p); +} + +/* Unaligned stores with endianness conversion */ + +static void +put_unaligned_le16(u16 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(le16_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + } +} 
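+
+/*
+ * Editor's illustrative sketch, not part of the upstream libdeflate header:
+ * the get/put helpers above fix the byte order explicitly, so their results
+ * do not depend on the host CPU's endianness.  The hypothetical round-trip
+ * below holds on both little- and big-endian targets, even at an unaligned
+ * address.
+ */
+static bool
+unaligned_le16_roundtrip_example(void)
+{
+	u8 buf[3];
+
+	/* Store 0xBEEF at an odd (unaligned) offset; bytes become {0xEF, 0xBE}. */
+	put_unaligned_le16(0xBEEF, &buf[1]);
+
+	/* Reading it back recovers the value regardless of host byte order. */
+	return get_unaligned_le16(&buf[1]) == 0xBEEF;
+}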
+ +static void +put_unaligned_be16(u16 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(be16_bswap(v), p); + } else { + p[0] = (u8)(v >> 8); + p[1] = (u8)(v >> 0); + } +} + +static void +put_unaligned_le32(u32 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(le32_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + p[2] = (u8)(v >> 16); + p[3] = (u8)(v >> 24); + } +} + +static void +put_unaligned_be32(u32 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(be32_bswap(v), p); + } else { + p[0] = (u8)(v >> 24); + p[1] = (u8)(v >> 16); + p[2] = (u8)(v >> 8); + p[3] = (u8)(v >> 0); + } +} + +static void +put_unaligned_le64(u64 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u64_unaligned(le64_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + p[2] = (u8)(v >> 16); + p[3] = (u8)(v >> 24); + p[4] = (u8)(v >> 32); + p[5] = (u8)(v >> 40); + p[6] = (u8)(v >> 48); + p[7] = (u8)(v >> 56); + } +} + +static void +put_unaligned_leword(machine_word_t v, u8 *p) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + put_unaligned_le32(v, p); + else + put_unaligned_le64(v, p); +} + +/* ========================================================================== */ +/* Bit manipulation functions */ +/* ========================================================================== */ + +/* + * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least + * significant end) of the *most* significant 1 bit in the input value. The + * input value must be nonzero! + */ + +static unsigned +bsr32(u32 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_clz) + return 31 - __builtin_clz(v); +#elif defined(_MSC_VER) + unsigned long i; + + _BitScanReverse(&i, v); + return i; +#else + unsigned i = 0; + + while ((v >>= 1) != 0) + i++; + return i; +#endif +} + +static unsigned +bsr64(u64 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_clzll) + return 63 - __builtin_clzll(v); +#elif defined(_MSC_VER) && defined(_WIN64) + unsigned long i; + + _BitScanReverse64(&i, v); + return i; +#else + unsigned i = 0; + + while ((v >>= 1) != 0) + i++; + return i; +#endif +} + +static unsigned +bsrw(machine_word_t v) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return bsr32(v); + else + return bsr64(v); +} + +/* + * Bit Scan Forward (BSF) - find the 0-based index (relative to the least + * significant end) of the *least* significant 1 bit in the input value. The + * input value must be nonzero! + */ + +static unsigned +bsf32(u32 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_ctz) + return __builtin_ctz(v); +#elif defined(_MSC_VER) + unsigned long i; + + _BitScanForward(&i, v); + return i; +#else + unsigned i = 0; + + for (; (v & 1) == 0; v >>= 1) + i++; + return i; +#endif +} + +static unsigned +bsf64(u64 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_ctzll) + return __builtin_ctzll(v); +#elif defined(_MSC_VER) && defined(_WIN64) + unsigned long i; + + _BitScanForward64(&i, v); + return i; +#else + unsigned i = 0; + + for (; (v & 1) == 0; v >>= 1) + i++; + return i; +#endif +} + +static unsigned +bsfw(machine_word_t v) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return bsf32(v); + else + return bsf64(v); +} + +/* + * rbit32(v): reverse the bits in a 32-bit integer. This doesn't have a + * fallback implementation; use '#ifdef rbit32' to check if this is available. 
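+ *
+ * (Editor's illustration, not upstream text: callers guard every use of it,
+ * e.g.
+ *
+ *	#ifdef rbit32
+ *		bits = rbit32(bits);
+ *	#else
+ *		... portable bit-reversal fallback ...
+ *	#endif
+ *
+ * so that builds for targets without the ARM 'rbit' instruction still
+ * compile.)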
+ */ +#undef rbit32 +#if (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM32) && \ + (__ARM_ARCH >= 7 || (__ARM_ARCH == 6 && defined(__ARM_ARCH_6T2__))) +static u32 +rbit32(u32 v) +{ + __asm__("rbit %0, %1" : "=r" (v) : "r" (v)); + return v; +} +#define rbit32 rbit32 +#elif (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM64) +static u32 +rbit32(u32 v) +{ + __asm__("rbit %w0, %w1" : "=r" (v) : "r" (v)); + return v; +} +#define rbit32 rbit32 +#endif + +#endif /* COMMON_DEFS_H */ diff --git a/packages/wasm/lib/crypto/COPYING.lesser b/packages/wasm/lib/crypto/COPYING.lesser new file mode 100644 index 00000000..153d416d --- /dev/null +++ b/packages/wasm/lib/crypto/COPYING.lesser @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. 
+ + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. 
+ + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. \ No newline at end of file diff --git a/packages/crypto/aes256.c b/packages/wasm/lib/crypto/aes256.c similarity index 100% rename from packages/crypto/aes256.c rename to packages/wasm/lib/crypto/aes256.c diff --git a/packages/crypto/aes256.h b/packages/wasm/lib/crypto/aes256.h similarity index 84% rename from packages/crypto/aes256.h rename to packages/wasm/lib/crypto/aes256.h index c7af249b..c84ea460 100644 --- a/packages/crypto/aes256.h +++ b/packages/wasm/lib/crypto/aes256.h @@ -1,12 +1,11 @@ -#include -#include -#include +#include "lib_common.h" #ifndef AES256_H #define AES256_H #define AES_BLOCK_SIZE 16 #define EXPANDED_KEY_SIZE 60 +#define AES_EXPORT __attribute__((visibility("default"))) #ifdef __cplusplus extern "C" { diff --git a/packages/wasm/lib/crypto/ctr256.c b/packages/wasm/lib/crypto/ctr256.c new file mode 100644 index 00000000..71239e59 --- /dev/null +++ b/packages/wasm/lib/crypto/ctr256.c @@ -0,0 +1,55 @@ +#include "aes256.h" + +struct ctr256_ctx { + uint32_t expandedKey[EXPANDED_KEY_SIZE]; + uint8_t* iv; + uint8_t state; +}; + +AES_EXPORT struct ctr256_ctx* ctr256_alloc(uint8_t* key, uint8_t* iv) { + struct ctr256_ctx *state = (struct ctr256_ctx *) __malloc(sizeof(struct ctr256_ctx)); + aes256_set_encryption_key(key, state->expandedKey); + __free(key); + + state->iv = iv; + state->state = 0; + + return state; +} + +AES_EXPORT void ctr256_free(struct ctr256_ctx* ctx) { + __free(ctx->iv); + __free(ctx); +} + +AES_EXPORT void ctr256(struct ctr256_ctx* ctx, uint8_t* in, uint32_t length, uint8_t *out) { + uint8_t chunk[AES_BLOCK_SIZE]; + uint32_t* expandedKey = ctx->expandedKey; + uint8_t* iv = ctx->iv; + uint8_t state = ctx->state; + uint32_t i, j, k; + + aes256_encrypt(iv, chunk, expandedKey); + + for (i = 0; i < length; i += AES_BLOCK_SIZE) { + for (j = 0; j < MIN(length - i, AES_BLOCK_SIZE); ++j) { + out[i + j] = in[i + j] ^ chunk[state++]; + + if (state >= AES_BLOCK_SIZE) + state = 0; + + if (state == 0) { + k = AES_BLOCK_SIZE; + while(k--) + if (++iv[k]) + break; + + aes256_encrypt(iv, chunk, expandedKey); + } + } + } + + __free(in); + + ctx->state = state; +} \ No newline at end of 
file diff --git a/packages/wasm/lib/crypto/ctr256.h b/packages/wasm/lib/crypto/ctr256.h new file mode 100644 index 00000000..9c984a09 --- /dev/null +++ b/packages/wasm/lib/crypto/ctr256.h @@ -0,0 +1,6 @@ +#ifndef CTR256_H +#define CTR256_H + +uint8_t *ctr256(const uint8_t in[], uint32_t length, const uint8_t key[32], uint8_t iv[16], uint8_t *state); + +#endif \ No newline at end of file diff --git a/packages/crypto/ige256.c b/packages/wasm/lib/crypto/ige256.c similarity index 84% rename from packages/crypto/ige256.c rename to packages/wasm/lib/crypto/ige256.c index 1e1cc731..6bc054ed 100644 --- a/packages/crypto/ige256.c +++ b/packages/wasm/lib/crypto/ige256.c @@ -1,6 +1,6 @@ #include "aes256.h" -void ige256_encrypt(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uint8_t* out) { +AES_EXPORT void ige256_encrypt(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uint8_t* out) { uint32_t expandedKey[EXPANDED_KEY_SIZE]; uint32_t i, j; @@ -29,7 +29,7 @@ void ige256_encrypt(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uin } } -void ige256_decrypt(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uint8_t* out) { +AES_EXPORT void ige256_decrypt(uint8_t* in, uint32_t length, uint8_t* key, uint8_t* iv, uint8_t* out) { uint32_t expandedKey[EXPANDED_KEY_SIZE]; uint32_t i, j; diff --git a/packages/crypto/ige256.h b/packages/wasm/lib/crypto/ige256.h similarity index 100% rename from packages/crypto/ige256.h rename to packages/wasm/lib/crypto/ige256.h diff --git a/packages/wasm/lib/lib_common.h b/packages/wasm/lib/lib_common.h new file mode 100644 index 00000000..8bf32b1c --- /dev/null +++ b/packages/wasm/lib/lib_common.h @@ -0,0 +1,62 @@ +/* + * lib_common.h - internal header included by all library code + */ + +#ifndef LIB_LIB_COMMON_H +#define LIB_LIB_COMMON_H + +#ifdef LIBDEFLATE_H + /* + * When building the library, LIBDEFLATEAPI needs to be defined properly before + * including libdeflate.h. + */ +# error "lib_common.h must always be included before libdeflate.h" +#endif + +#if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBDEFLATE_EXPORT_SYM __declspec(dllexport) +#elif defined(__GNUC__) +# define LIBDEFLATE_EXPORT_SYM __attribute__((visibility("default"))) +#else +# define LIBDEFLATE_EXPORT_SYM +#endif + +/* + * On i386, gcc assumes that the stack is 16-byte aligned at function entry. + * However, some compilers (e.g. MSVC) and programming languages (e.g. Delphi) + * only guarantee 4-byte alignment when calling functions. This is mainly an + * issue on Windows, but it has been seen on Linux too. Work around this ABI + * incompatibility by realigning the stack pointer when entering libdeflate. + * This prevents crashes in SSE/AVX code. 
+ */ +#if defined(__GNUC__) && defined(__i386__) +# define LIBDEFLATE_ALIGN_STACK __attribute__((force_align_arg_pointer)) +#else +# define LIBDEFLATE_ALIGN_STACK +#endif + +#define LIBDEFLATEAPI LIBDEFLATE_EXPORT_SYM LIBDEFLATE_ALIGN_STACK + +#include "common_defs.h" + +extern void* __malloc(size_t size); +extern void __free(void* ptr); + +void *libdeflate_aligned_malloc(size_t alignment, size_t size); +void libdeflate_aligned_free(void *ptr); + +#define ASSERT(expr) (void)(expr) +#define CONCAT_IMPL(a, b) a##b +#define CONCAT(a, b) CONCAT_IMPL(a, b) +#define ADD_SUFFIX(name) CONCAT(name, SUFFIX) + +#ifdef LOGGING +void __debug(char* str); + +#define DEBUG(str) __debug(str); + +#else +#define DEBUG(str) +#endif + +#endif /* LIB_LIB_COMMON_H */ diff --git a/packages/wasm/lib/libdeflate.h b/packages/wasm/lib/libdeflate.h new file mode 100644 index 00000000..1ac01833 --- /dev/null +++ b/packages/wasm/lib/libdeflate.h @@ -0,0 +1,245 @@ +/* + * libdeflate.h - public header for libdeflate + */ + +#ifndef LIBDEFLATE_H +#define LIBDEFLATE_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIBDEFLATE_VERSION_MAJOR 1 +#define LIBDEFLATE_VERSION_MINOR 19 +#define LIBDEFLATE_VERSION_STRING "1.19" + +/* + * Users of libdeflate.dll on Windows can define LIBDEFLATE_DLL to cause + * __declspec(dllimport) to be used. This should be done when it's easy to do. + * Otherwise it's fine to skip it, since it is a very minor performance + * optimization that is irrelevant for most use cases of libdeflate. + */ +#ifndef LIBDEFLATEAPI +# if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBDEFLATEAPI __declspec(dllimport) +# else +# define LIBDEFLATEAPI +# endif +#endif + +/* ========================================================================== */ +/* Compression */ +/* ========================================================================== */ + +struct libdeflate_compressor; +struct libdeflate_options; + +/* + * libdeflate_alloc_compressor() allocates a new compressor that supports + * DEFLATE, zlib, and gzip compression. 'compression_level' is the compression + * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 = + * medium/default, 9 = slow, 12 = slowest). Level 0 is also supported and means + * "no compression", specifically "create a valid stream, but only emit + * uncompressed blocks" (this will expand the data slightly). + * + * The return value is a pointer to the new compressor, or NULL if out of memory + * or if the compression level is invalid (i.e. outside the range [0, 12]). + * + * Note: for compression, the sliding window size is defined at compilation time + * to 32768, the largest size permissible in the DEFLATE format. It cannot be + * changed at runtime. + * + * A single compressor is not safe to use by multiple threads concurrently. + * However, different threads may use different compressors concurrently. + */ +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor(int compression_level); + +/* + * Like libdeflate_alloc_compressor(), but adds the 'options' argument. 
+ */ +//LIBDEFLATEAPI struct libdeflate_compressor * +//libdeflate_alloc_compressor_ex(int compression_level, +// const struct libdeflate_options *options); + +LIBDEFLATEAPI size_t +libdeflate_gzip_compress(struct libdeflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +//LIBDEFLATEAPI size_t +//libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor, +// size_t in_nbytes); + +/* + * libdeflate_free_compressor() frees a compressor that was allocated with + * libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is + * taken. + */ +LIBDEFLATEAPI void +libdeflate_free_compressor(struct libdeflate_compressor *compressor); + +/* ========================================================================== */ +/* Decompression */ +/* ========================================================================== */ + +struct libdeflate_decompressor; +struct libdeflate_options; + +/* + * libdeflate_alloc_decompressor() allocates a new decompressor that can be used + * for DEFLATE, zlib, and gzip decompression. The return value is a pointer to + * the new decompressor, or NULL if out of memory. + * + * This function takes no parameters, and the returned decompressor is valid for + * decompressing data that was compressed at any compression level and with any + * sliding window size. + * + * A single decompressor is not safe to use by multiple threads concurrently. + * However, different threads may use different decompressors concurrently. + */ +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor(void); + +/* + * Like libdeflate_alloc_decompressor(), but adds the 'options' argument. + */ +//LIBDEFLATEAPI struct libdeflate_decompressor * +//libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options); + +/* + * Result of a call to libdeflate_deflate_decompress(), + * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress(). + */ +enum libdeflate_result { + /* Decompression was successful. */ + LIBDEFLATE_SUCCESS = 0, + + /* Decompression failed because the compressed data was invalid, + * corrupt, or otherwise unsupported. */ + LIBDEFLATE_BAD_DATA = 1, + + /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have + * decompressed to fewer than 'out_nbytes_avail' bytes. */ + LIBDEFLATE_SHORT_OUTPUT = 2, + + /* The data would have decompressed to more than 'out_nbytes_avail' + * bytes. */ + LIBDEFLATE_INSUFFICIENT_SPACE = 3, +}; + +/* + * libdeflate_deflate_decompress() decompresses a DEFLATE stream from the buffer + * 'in' with compressed size up to 'in_nbytes' bytes. The uncompressed data is + * written to 'out', a buffer with size 'out_nbytes_avail' bytes. If + * decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. Otherwise, + * a nonzero result code such as LIBDEFLATE_BAD_DATA is returned, and the + * contents of the output buffer are undefined. + * + * Decompression stops at the end of the DEFLATE stream (as indicated by the + * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes. + * + * libdeflate_deflate_decompress() can be used in cases where the actual + * uncompressed size is known (recommended) or unknown (not recommended): + * + * - If the actual uncompressed size is known, then pass the actual + * uncompressed size as 'out_nbytes_avail' and pass NULL for + * 'actual_out_nbytes_ret'. 
This makes libdeflate_deflate_decompress() fail + * with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the + * specified number of bytes. + * + * - If the actual uncompressed size is unknown, then provide a non-NULL + * 'actual_out_nbytes_ret' and provide a buffer with some size + * 'out_nbytes_avail' that you think is large enough to hold all the + * uncompressed data. In this case, if the data decompresses to less than + * or equal to 'out_nbytes_avail' bytes, then + * libdeflate_deflate_decompress() will write the actual uncompressed size + * to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS). Otherwise, + * it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was + * not large enough but no other problems were encountered, or another + * nonzero result code if decompression failed for another reason. + */ +//LIBDEFLATEAPI enum libdeflate_result +//libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor, +// const void *in, size_t in_nbytes, +// void *out, size_t out_nbytes_avail, +// size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL, + * then the actual compressed size of the DEFLATE stream (aligned to the next + * byte boundary) is written to *actual_in_nbytes_ret. + */ +enum libdeflate_result +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format + * instead of raw DEFLATE. + * + * If multiple gzip-compressed members are concatenated, then only the first + * will be decompressed. Use libdeflate_gzip_decompress_ex() if you need + * multi-member support. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + + +/* + * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression + * succeeds (indicating that the first gzip-compressed member in the input + * buffer was decompressed), then the actual number of input bytes consumed is + * written to *actual_in_nbytes_ret. + */ +//LIBDEFLATEAPI enum libdeflate_result +//libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor, +// const void *in, size_t in_nbytes, +// void *out, size_t out_nbytes_avail, +// size_t *actual_in_nbytes_ret, +// size_t *actual_out_nbytes_ret); + +/* + * libdeflate_free_decompressor() frees a decompressor that was allocated with + * libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action + * is taken. + */ +LIBDEFLATEAPI void +libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); + +/* + * Advanced options. This is the options structure that + * libdeflate_alloc_compressor_ex() and libdeflate_alloc_decompressor_ex() + * require. Most users won't need this and should just use the non-"_ex" + * functions instead. If you do need this, it should be initialized like this: + * + * struct libdeflate_options options; + * + * __builtin_memset(&options, 0, sizeof(options)); + * options.sizeof_options = sizeof(options); + * // Then set the fields that you need to override the defaults for. 
+ */ +struct libdeflate_options { + /* + * This field must be set to the struct size. This field exists for + * extensibility, so that fields can be appended to this struct in + * future versions of libdeflate while still supporting old binaries. + */ + size_t sizeof_options; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* LIBDEFLATE_H */ diff --git a/packages/wasm/lib/libdeflate/COPYING b/packages/wasm/lib/libdeflate/COPYING new file mode 100644 index 00000000..1f1b81cd --- /dev/null +++ b/packages/wasm/lib/libdeflate/COPYING @@ -0,0 +1,21 @@ +Copyright 2016 Eric Biggers + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation files +(the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of the Software, +and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/packages/wasm/lib/libdeflate/adler32.c b/packages/wasm/lib/libdeflate/adler32.c new file mode 100644 index 00000000..9043000a --- /dev/null +++ b/packages/wasm/lib/libdeflate/adler32.c @@ -0,0 +1,123 @@ +/* + * adler32.c - Adler-32 checksum algorithm + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "lib_common.h" + +/* The Adler-32 divisor, or "base", value */ +#define DIVISOR 65521 + +/* + * MAX_CHUNK_LEN is the most bytes that can be processed without the possibility + * of s2 overflowing when it is represented as an unsigned 32-bit integer. 
This + * value was computed using the following Python script: + * + * divisor = 65521 + * count = 0 + * s1 = divisor - 1 + * s2 = divisor - 1 + * while True: + * s1 += 0xFF + * s2 += s1 + * if s2 > 0xFFFFFFFF: + * break + * count += 1 + * print(count) + * + * Note that to get the correct worst-case value, we must assume that every byte + * has value 0xFF and that s1 and s2 started with the highest possible values + * modulo the divisor. + */ +#define MAX_CHUNK_LEN 5552 + +static u32 +adler32_generic(u32 adler, const u8 *p, size_t len) +{ + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + const u8 * const end = p + len; + + while (p != end) { + size_t chunk_len = MIN(end - p, MAX_CHUNK_LEN); + const u8 *chunk_end = p + chunk_len; + size_t num_unrolled_iterations = chunk_len / 4; + + while (num_unrolled_iterations--) { + s1 += *p++; + s2 += s1; + s1 += *p++; + s2 += s1; + s1 += *p++; + s2 += s1; + s1 += *p++; + s2 += s1; + } + while (p != chunk_end) { + s1 += *p++; + s2 += s1; + } + s1 %= DIVISOR; + s2 %= DIVISOR; + } + + return (s2 << 16) | s1; +} + +/* Include architecture-specific implementation(s) if available. */ +#undef DEFAULT_IMPL +#undef arch_select_adler32_func +typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len); + +#define DEFAULT_IMPL adler32_generic + +#ifdef arch_select_adler32_func +static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len); + +static volatile adler32_func_t adler32_impl = dispatch_adler32; + +/* Choose the best implementation at runtime. */ +static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len) +{ + adler32_func_t f = arch_select_adler32_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + adler32_impl = f; + return f(adler, p, len); +} +#else +/* The best implementation is statically known, so call it directly. */ +#define adler32_impl DEFAULT_IMPL +#endif + +u32 +libdeflate_adler32(u32 adler, const void *buffer, size_t len) +{ + if (buffer == NULL) /* Return initial value. */ + return 1; + return adler32_impl(adler, buffer, len); +} diff --git a/packages/wasm/lib/libdeflate/adler32.h b/packages/wasm/lib/libdeflate/adler32.h new file mode 100644 index 00000000..5468e14c --- /dev/null +++ b/packages/wasm/lib/libdeflate/adler32.h @@ -0,0 +1,8 @@ +#ifndef LIB_DEFLATE_ADLER32_H +#define LIB_DEFLATE_ADLER32_H + +#include "lib_common.h" + +u32 libdeflate_adler32(u32 adler, const void *buffer, size_t len); + +#endif /* LIB_DEFLATE_ADLER32_H */ diff --git a/packages/wasm/lib/libdeflate/bt_matchfinder.h b/packages/wasm/lib/libdeflate/bt_matchfinder.h new file mode 100644 index 00000000..7bc4f04d --- /dev/null +++ b/packages/wasm/lib/libdeflate/bt_matchfinder.h @@ -0,0 +1,342 @@ +/* + * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * ---------------------------------------------------------------------------- + * + * This is a Binary Trees (bt) based matchfinder. + * + * The main data structure is a hash table where each hash bucket contains a + * binary tree of sequences whose first 4 bytes share the same hash code. Each + * sequence is identified by its starting position in the input buffer. Each + * binary tree is always sorted such that each left child represents a sequence + * lexicographically lesser than its parent and each right child represents a + * sequence lexicographically greater than its parent. + * + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 4 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, a new binary tree + * node is created to represent the current sequence. Then, in a single tree + * traversal, the hash bucket's binary tree is searched for matches and is + * re-rooted at the new node. + * + * Compared to the simpler algorithm that uses linked lists instead of binary + * trees (see hc_matchfinder.h), the binary tree version gains more information + * at each node visitation. Ideally, the binary tree version will examine only + * 'log(n)' nodes to find the same matches that the linked list version will + * find by examining 'n' nodes. In addition, the binary tree version can + * examine fewer bytes at each node by taking advantage of the common prefixes + * that result from the sort order, whereas the linked list version may have to + * examine up to the full length of the match at each node. + * + * However, it is not always best to use the binary tree version. It requires + * nearly twice as much memory as the linked list version, and it takes time to + * keep the binary trees sorted, even at positions where the compressor does not + * need matches. Generally, when doing fast compression on small buffers, + * binary trees are the wrong approach. They are best suited for thorough + * compression and/or large buffers. + * + * ---------------------------------------------------------------------------- + */ + +#ifndef LIB_BT_MATCHFINDER_H +#define LIB_BT_MATCHFINDER_H + +#include "matchfinder_common.h" + +#define BT_MATCHFINDER_HASH3_ORDER 16 +#define BT_MATCHFINDER_HASH3_WAYS 2 +#define BT_MATCHFINDER_HASH4_ORDER 16 + +#define BT_MATCHFINDER_TOTAL_HASH_SIZE \ + (((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \ + (1UL << BT_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t)) + +/* Representation of a match found by the bt_matchfinder */ +struct lz_match { + + /* The number of bytes matched. */ + u16 length; + + /* The offset back from the current position that was matched. 
*/ + u16 offset; +}; + +struct MATCHFINDER_ALIGNED bt_matchfinder { + + /* The hash table for finding length 3 matches */ + mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS]; + + /* The hash table which contains the roots of the binary trees for + * finding length 4+ matches */ + mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER]; + + /* The child node references for the binary trees. The left and right + * children of the node for the sequence with position 'pos' are + * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */ + mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE]; +}; + +/* Prepare the matchfinder for a new input buffer. */ +static void +bt_matchfinder_init(struct bt_matchfinder *mf) +{ + STATIC_ASSERT(BT_MATCHFINDER_TOTAL_HASH_SIZE % + MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_SIZE); +} + +static void +bt_matchfinder_slide_window(struct bt_matchfinder *mf) +{ + STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf)); +} + +static mf_pos_t * +bt_left_child(struct bt_matchfinder *mf, s32 node) +{ + return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0]; +} + +static mf_pos_t * +bt_right_child(struct bt_matchfinder *mf, s32 node) +{ + return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1]; +} + +/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches() + * and bt_matchfinder_skip_byte(). There must be sufficiently many bytes + * remaining to load a 32-bit integer from the *next* position. */ +#define BT_MATCHFINDER_REQUIRED_NBYTES 5 + +/* Advance the binary tree matchfinder by one byte, optionally recording + * matches. @record_matches should be a compile-time constant. 
*/ +static struct lz_match * +bt_matchfinder_advance_one_byte(struct bt_matchfinder * const mf, + const u8 * const in_base, + const ptrdiff_t cur_pos, + const u32 max_len, + const u32 nice_len, + const u32 max_search_depth, + u32 * const next_hashes, + struct lz_match *lz_matchptr, + const bool record_matches) +{ + const u8 *in_next = in_base + cur_pos; + u32 depth_remaining = max_search_depth; + const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE; + u32 next_hashseq; + u32 hash3; + u32 hash4; + s32 cur_node; +#if BT_MATCHFINDER_HASH3_WAYS >= 2 + s32 cur_node_2; +#endif + const u8 *matchptr; + mf_pos_t *pending_lt_ptr, *pending_gt_ptr; + u32 best_lt_len, best_gt_len; + u32 len; + u32 best_len = 3; + + STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 && + BT_MATCHFINDER_HASH3_WAYS <= 2); + + next_hashseq = get_unaligned_le32(in_next + 1); + + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + + next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, BT_MATCHFINDER_HASH3_ORDER); + next_hashes[1] = lz_hash(next_hashseq, BT_MATCHFINDER_HASH4_ORDER); + prefetchw(&mf->hash3_tab[next_hashes[0]]); + prefetchw(&mf->hash4_tab[next_hashes[1]]); + + cur_node = mf->hash3_tab[hash3][0]; + mf->hash3_tab[hash3][0] = cur_pos; +#if BT_MATCHFINDER_HASH3_WAYS >= 2 + cur_node_2 = mf->hash3_tab[hash3][1]; + mf->hash3_tab[hash3][1] = cur_node; +#endif + if (record_matches && cur_node > cutoff) { + u32 seq3 = load_u24_unaligned(in_next); + if (seq3 == load_u24_unaligned(&in_base[cur_node])) { + lz_matchptr->length = 3; + lz_matchptr->offset = in_next - &in_base[cur_node]; + lz_matchptr++; + } + #if BT_MATCHFINDER_HASH3_WAYS >= 2 + else if (cur_node_2 > cutoff && + seq3 == load_u24_unaligned(&in_base[cur_node_2])) + { + lz_matchptr->length = 3; + lz_matchptr->offset = in_next - &in_base[cur_node_2]; + lz_matchptr++; + } + #endif + } + + cur_node = mf->hash4_tab[hash4]; + mf->hash4_tab[hash4] = cur_pos; + + pending_lt_ptr = bt_left_child(mf, cur_pos); + pending_gt_ptr = bt_right_child(mf, cur_pos); + + if (cur_node <= cutoff) { + *pending_lt_ptr = MATCHFINDER_INITVAL; + *pending_gt_ptr = MATCHFINDER_INITVAL; + return lz_matchptr; + } + + best_lt_len = 0; + best_gt_len = 0; + len = 0; + + for (;;) { + matchptr = &in_base[cur_node]; + + if (matchptr[len] == in_next[len]) { + len = lz_extend(in_next, matchptr, len + 1, max_len); + if (!record_matches || len > best_len) { + if (record_matches) { + best_len = len; + lz_matchptr->length = len; + lz_matchptr->offset = in_next - matchptr; + lz_matchptr++; + } + if (len >= nice_len) { + *pending_lt_ptr = *bt_left_child(mf, cur_node); + *pending_gt_ptr = *bt_right_child(mf, cur_node); + return lz_matchptr; + } + } + } + + if (matchptr[len] < in_next[len]) { + *pending_lt_ptr = cur_node; + pending_lt_ptr = bt_right_child(mf, cur_node); + cur_node = *pending_lt_ptr; + best_lt_len = len; + if (best_gt_len < len) + len = best_gt_len; + } else { + *pending_gt_ptr = cur_node; + pending_gt_ptr = bt_left_child(mf, cur_node); + cur_node = *pending_gt_ptr; + best_gt_len = len; + if (best_lt_len < len) + len = best_lt_len; + } + + if (cur_node <= cutoff || !--depth_remaining) { + *pending_lt_ptr = MATCHFINDER_INITVAL; + *pending_gt_ptr = MATCHFINDER_INITVAL; + return lz_matchptr; + } + } +} + +/* + * Retrieve a list of matches with the current position. + * + * @mf + * The matchfinder structure. + * @in_base + * Pointer to the next byte in the input buffer to process _at the last + * time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_. 
+ * @cur_pos + * The current position in the input buffer relative to @in_base (the + * position of the sequence being matched against). + * @max_len + * The maximum permissible match length at this position. Must be >= + * BT_MATCHFINDER_REQUIRED_NBYTES. + * @nice_len + * Stop searching if a match of at least this length is found. + * Must be <= @max_len. + * @max_search_depth + * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + 1. + * @lz_matchptr + * An array in which this function will record the matches. The recorded + * matches will be sorted by strictly increasing length and (non-strictly) + * increasing offset. The maximum number of matches that may be found is + * 'nice_len - 2'. + * + * The return value is a pointer to the next available slot in the @lz_matchptr + * array. (If no matches were found, this will be the same as @lz_matchptr.) + */ +static struct lz_match * +bt_matchfinder_get_matches(struct bt_matchfinder *mf, + const u8 *in_base, + ptrdiff_t cur_pos, + u32 max_len, + u32 nice_len, + u32 max_search_depth, + u32 next_hashes[2], + struct lz_match *lz_matchptr) +{ + return bt_matchfinder_advance_one_byte(mf, + in_base, + cur_pos, + max_len, + nice_len, + max_search_depth, + next_hashes, + lz_matchptr, + true); +} + +/* + * Advance the matchfinder, but don't record any matches. + * + * This is very similar to bt_matchfinder_get_matches() because both functions + * must do hashing and tree re-rooting. + */ +static void +bt_matchfinder_skip_byte(struct bt_matchfinder *mf, + const u8 *in_base, + ptrdiff_t cur_pos, + u32 nice_len, + u32 max_search_depth, + u32 next_hashes[2]) +{ + bt_matchfinder_advance_one_byte(mf, + in_base, + cur_pos, + nice_len, + nice_len, + max_search_depth, + next_hashes, + NULL, + false); +} + +#endif /* LIB_BT_MATCHFINDER_H */ diff --git a/packages/wasm/lib/libdeflate/decompress_template.h b/packages/wasm/lib/libdeflate/decompress_template.h new file mode 100644 index 00000000..ac1987f3 --- /dev/null +++ b/packages/wasm/lib/libdeflate/decompress_template.h @@ -0,0 +1,777 @@ +/* + * decompress_template.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +/* + * This is the actual DEFLATE decompression routine, lifted out of + * deflate_decompress.c so that it can be compiled multiple times with different + * target instruction sets. + */ + +#ifndef ATTRIBUTES +# define ATTRIBUTES +#endif +#ifndef EXTRACT_VARBITS +# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count)) +#endif +#ifndef EXTRACT_VARBITS8 +# define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count))) +#endif + +static enum libdeflate_result ATTRIBUTES +FUNCNAME(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) +{ + u8 *out_next = out; + u8 * const out_end = out_next + out_nbytes_avail; + u8 * const out_fastloop_end = + out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN); + + /* Input bitstream state; see deflate_decompress.c for documentation */ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + const u8 * const in_fastloop_end = + in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ); + bitbuf_t bitbuf = 0; + bitbuf_t saved_bitbuf; + u32 bitsleft = 0; + size_t overread_count = 0; + + bool is_final_block; + unsigned block_type; + unsigned num_litlen_syms; + unsigned num_offset_syms; + bitbuf_t litlen_tablemask; + u32 entry; + +next_block: + /* Starting to read the next block */ + ; + + STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3)); + REFILL_BITS(); + + /* BFINAL: 1 bit */ + is_final_block = bitbuf & BITMASK(1); + + /* BTYPE: 2 bits */ + block_type = (bitbuf >> 1) & BITMASK(2); + + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { + + /* Dynamic Huffman block */ + + /* The order in which precode lengths are stored */ + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + unsigned num_explicit_precode_lens; + unsigned i; + + /* Read the codeword length counts. */ + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5)); + num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5)); + num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4)); + num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4)); + + d->static_codes_loaded = false; + + /* + * Read the precode codeword lengths. + * + * A 64-bit bitbuffer is just one bit too small to hold the + * maximum number of precode lens, so to minimize branches we + * merge one len with the previous fields. + */ + STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); + if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { + d->u.precode_lens[deflate_precode_lens_permutation[0]] = + (bitbuf >> 17) & BITMASK(3); + bitbuf >>= 20; + bitsleft -= 20; + REFILL_BITS(); + i = 1; + do { + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } else { + bitbuf >>= 17; + bitsleft -= 17; + i = 0; + do { + if ((u8)bitsleft < 3) + REFILL_BITS(); + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } + for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) + d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; + + /* Build the decode table for the precode. 
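
The block header fields above (BFINAL, then BTYPE, then the dynamic-header counts) are read from an LSB-first bitstream. The standalone sketch below is not part of libdeflate; it uses a hypothetical byte-at-a-time reader (struct bitreader, read_bits) to show the same bit ordering, without the word-sized REFILL_BITS machinery used in the real decompressor.

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Minimal LSB-first bit reader (illustration only; the real decompressor
 * refills a machine-word-sized bitbuffer for speed). The caller must not
 * request more bits than remain in the input. */
struct bitreader {
    const uint8_t *in;
    size_t in_len;
    size_t in_pos;
    uint32_t bitbuf;   /* bits not yet consumed, LSB = next bit */
    unsigned bitsleft;
};

static uint32_t read_bits(struct bitreader *br, unsigned n)
{
    while (br->bitsleft < n && br->in_pos < br->in_len) {
        br->bitbuf |= (uint32_t)br->in[br->in_pos++] << br->bitsleft;
        br->bitsleft += 8;
    }
    uint32_t v = br->bitbuf & ((1u << n) - 1);
    br->bitbuf >>= n;
    br->bitsleft -= n;
    return v;
}

int main(void)
{
    /* 0x03 = binary 00000011: BFINAL=1, then BTYPE=01 (static Huffman). */
    const uint8_t data[] = { 0x03 };
    struct bitreader br = { data, sizeof(data), 0, 0, 0 };

    unsigned bfinal = read_bits(&br, 1);  /* -> 1 */
    unsigned btype  = read_bits(&br, 2);  /* -> 1 (static Huffman) */
    printf("BFINAL=%u BTYPE=%u\n", bfinal, btype);
    return 0;
}
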
*/ + SAFETY_CHECK(build_precode_decode_table(d)); + + /* Decode the litlen and offset codeword lengths. */ + i = 0; + do { + unsigned presym; + u8 rep_val; + unsigned rep_count; + + if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7) + REFILL_BITS(); + + /* + * The code below assumes that the precode decode table + * doesn't have any subtables. + */ + STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); + + /* Decode the next precode symbol. */ + entry = d->u.l.precode_decode_table[ + bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)]; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + presym = entry >> 16; + + if (presym < 16) { + /* Explicit codeword length */ + d->u.l.lens[i++] = presym; + continue; + } + + /* Run-length encoded codeword lengths */ + + /* + * Note: we don't need to immediately verify that the + * repeat count doesn't overflow the number of elements, + * since we've sized the lens array to have enough extra + * space to allow for the worst-case overrun (138 zeroes + * when only 1 length was remaining). + * + * In the case of the small repeat counts (presyms 16 + * and 17), it is fastest to always write the maximum + * number of entries. That gets rid of branches that + * would otherwise be required. + * + * It is not just because of the numerical order that + * our checks go in the order 'presym < 16', 'presym == + * 16', and 'presym == 17'. For typical data this is + * ordered from most frequent to least frequent case. + */ + STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); + + if (presym == 16) { + /* Repeat the previous length 3 - 6 times. */ + SAFETY_CHECK(i != 0); + rep_val = d->u.l.lens[i - 1]; + STATIC_ASSERT(3 + BITMASK(2) == 6); + rep_count = 3 + (bitbuf & BITMASK(2)); + bitbuf >>= 2; + bitsleft -= 2; + d->u.l.lens[i + 0] = rep_val; + d->u.l.lens[i + 1] = rep_val; + d->u.l.lens[i + 2] = rep_val; + d->u.l.lens[i + 3] = rep_val; + d->u.l.lens[i + 4] = rep_val; + d->u.l.lens[i + 5] = rep_val; + i += rep_count; + } else if (presym == 17) { + /* Repeat zero 3 - 10 times. */ + STATIC_ASSERT(3 + BITMASK(3) == 10); + rep_count = 3 + (bitbuf & BITMASK(3)); + bitbuf >>= 3; + bitsleft -= 3; + d->u.l.lens[i + 0] = 0; + d->u.l.lens[i + 1] = 0; + d->u.l.lens[i + 2] = 0; + d->u.l.lens[i + 3] = 0; + d->u.l.lens[i + 4] = 0; + d->u.l.lens[i + 5] = 0; + d->u.l.lens[i + 6] = 0; + d->u.l.lens[i + 7] = 0; + d->u.l.lens[i + 8] = 0; + d->u.l.lens[i + 9] = 0; + i += rep_count; + } else { + /* Repeat zero 11 - 138 times. */ + STATIC_ASSERT(11 + BITMASK(7) == 138); + rep_count = 11 + (bitbuf & BITMASK(7)); + bitbuf >>= 7; + bitsleft -= 7; + __builtin_memset(&d->u.l.lens[i], 0, + rep_count * sizeof(d->u.l.lens[i])); + i += rep_count; + } + } while (i < num_litlen_syms + num_offset_syms); + + /* Unnecessary, but check this for consistency with zlib. */ + SAFETY_CHECK(i == num_litlen_syms + num_offset_syms); + + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + u16 len, nlen; + + /* + * Uncompressed block: copy 'len' bytes literally from the input + * buffer to the output buffer. + */ + + bitsleft -= 3; /* for BTYPE and BFINAL */ + + /* + * Align the bitstream to the next byte boundary. This means + * the next byte boundary as if we were reading a byte at a + * time. Therefore, we have to rewind 'in_next' by any bytes + * that have been refilled but not actually consumed yet (not + * counting overread bytes, which don't increment 'in_next'). 
+ */ + bitsleft = (u8)bitsleft; + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + in_next -= (bitsleft >> 3) - overread_count; + overread_count = 0; + bitbuf = 0; + bitsleft = 0; + + SAFETY_CHECK(in_end - in_next >= 4); + len = get_unaligned_le16(in_next); + nlen = get_unaligned_le16(in_next + 2); + in_next += 4; + + SAFETY_CHECK(len == (u16)~nlen); + if (unlikely(len > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + SAFETY_CHECK(len <= in_end - in_next); + + __builtin_memcpy(out_next, in_next, len); + in_next += len; + out_next += len; + + goto block_done; + + } else { + unsigned i; + + SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); + + /* + * Static Huffman block: build the decode tables for the static + * codes. Skip doing so if the tables are already set up from + * an earlier static block; this speeds up decompression of + * degenerate input of many empty or very short static blocks. + * + * Afterwards, the remainder is the same as decompressing a + * dynamic Huffman block. + */ + + bitbuf >>= 3; /* for BTYPE and BFINAL */ + bitsleft -= 3; + + if (d->static_codes_loaded) + goto have_decode_tables; + + d->static_codes_loaded = true; + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); + + for (i = 0; i < 144; i++) + d->u.l.lens[i] = 8; + for (; i < 256; i++) + d->u.l.lens[i] = 9; + for (; i < 280; i++) + d->u.l.lens[i] = 7; + for (; i < 288; i++) + d->u.l.lens[i] = 8; + + for (; i < 288 + 32; i++) + d->u.l.lens[i] = 5; + + num_litlen_syms = 288; + num_offset_syms = 32; + } + + /* Decompressing a Huffman block (either dynamic or static) */ + + SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); + SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); +have_decode_tables: + litlen_tablemask = BITMASK(d->litlen_tablebits); + + /* + * This is the "fastloop" for decoding literals and matches. It does + * bounds checks on in_next and out_next in the loop conditions so that + * additional bounds checks aren't needed inside the loop body. + * + * To reduce latency, the bitbuffer is refilled and the next litlen + * decode table entry is preloaded before each loop iteration. + */ + if (in_next >= in_fastloop_end || out_next >= out_fastloop_end) + goto generic_loop; + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + do { + u32 length, offset, lit; + const u8 *src; + u8 *dst; + + /* + * Consume the bits for the litlen decode table entry. Save the + * original bitbuf for later, in case the extra match length + * bits need to be extracted from it. + */ + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + + /* + * Begin by checking for a "fast" literal, i.e. a literal that + * doesn't need a subtable. + */ + if (entry & HUFFDEC_LITERAL) { + /* + * On 64-bit platforms, we decode up to 2 extra fast + * literals in addition to the primary item, as this + * increases performance and still leaves enough bits + * remaining for what follows. We could actually do 3, + * assuming LITLEN_TABLEBITS=11, but that actually + * decreases performance slightly (perhaps by messing + * with the branch prediction of the conditional refill + * that happens later while decoding the match offset). + * + * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN + * and FASTLOOP_MAX_BYTES_READ need to be updated if the + * number of extra literals decoded here is changed. 
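
For uncompressed (stored) blocks, the 4-byte header holds LEN and its one's complement NLEN, read little-endian after byte alignment. The sketch below is a separate illustration, not the code path above; parse_stored_header is a hypothetical helper showing the len == (u16)~nlen consistency check.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Parse the LEN/NLEN header of a DEFLATE stored block (after the bitstream
 * has been aligned to a byte boundary). Returns false if NLEN is not the
 * one's complement of LEN, which means the stream is corrupt. */
static bool parse_stored_header(const uint8_t hdr[4], uint16_t *len_out)
{
    uint16_t len  = (uint16_t)(hdr[0] | (hdr[1] << 8));  /* little-endian */
    uint16_t nlen = (uint16_t)(hdr[2] | (hdr[3] << 8));

    if (len != (uint16_t)~nlen)
        return false;
    *len_out = len;
    return true;
}

int main(void)
{
    const uint8_t hdr[4] = { 0x05, 0x00, 0xfa, 0xff };  /* LEN=5, NLEN=~5 */
    uint16_t len;

    if (parse_stored_header(hdr, &len))
        printf("stored block of %u literal bytes follows\n", len);
    else
        printf("corrupt stored-block header\n");
    return 0;
}
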
+ */ + if (/* enough bits for 2 fast literals + length + offset preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + LENGTH_MAXBITS, + OFFSET_TABLEBITS) && + /* enough bits for 2 fast literals + slow literal + litlen preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS)) { + /* 1st extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* 2nd extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* + * Another fast literal, but + * this one is in lieu of the + * primary item, so it doesn't + * count as one of the extras. + */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + } else { + /* + * Decode a literal. While doing so, preload + * the next litlen decode table entry and refill + * the bitbuffer. To reduce latency, we've + * arranged for there to be enough "preloadable" + * bits remaining to do the table preload + * independently of the refill. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD( + LITLEN_TABLEBITS, LITLEN_TABLEBITS)); + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + + /* + * It's not a literal entry, so it can be a length entry, a + * subtable pointer entry, or an end-of-block entry. Detect the + * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Subtable pointer or end-of-block entry */ + + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + + /* + * A subtable is required. Load and consume the + * subtable entry. The subtable entry can be of any + * type: literal, length, or end-of-block. + */ + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + + /* + * 32-bit platforms that use the byte-at-a-time refill + * method have to do a refill here for there to always + * be enough bits to decode a literal that requires a + * subtable, then preload the next litlen decode table + * entry; or to decode a match length that requires a + * subtable, then preload the offset decode table entry. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS) || + !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS, + OFFSET_TABLEBITS)) + REFILL_BITS_IN_FASTLOOP(); + if (entry & HUFFDEC_LITERAL) { + /* Decode a literal that required a subtable. */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + /* Else, it's a length that required a subtable. */ + } + + /* + * Decode the match length: the length base value associated + * with the litlen symbol (which we extract from the decode + * table entry), plus the extra length bits. We don't need to + * consume the extra length bits here, as they were included in + * the bits consumed by the entry earlier. 
We also don't need + * to check for too-long matches here, as this is inside the + * fastloop where it's already been verified that the output + * buffer has enough space remaining to copy a max-length match. + */ + length = entry >> 16; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* + * Decode the match offset. There are enough "preloadable" bits + * remaining to preload the offset decode table entry, but a + * refill might be needed before consuming it. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS, + OFFSET_TABLEBITS)); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS, + LITLEN_TABLEBITS)) { + /* + * Decoding a match offset on a 64-bit platform. We may + * need to refill once, but then we can decode the whole + * offset and preload the next litlen table entry. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + if (unlikely((u8)bitsleft < OFFSET_MAXBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + } else { + /* Decoding a match offset on a 32-bit platform */ + REFILL_BITS_IN_FASTLOOP(); + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + REFILL_BITS_IN_FASTLOOP(); + /* No further refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME( + OFFSET_MAXBITS - OFFSET_TABLEBITS)); + } else { + /* No refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS)); + } + } + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + offset = entry >> 16; + offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* Validate the match offset; needed even in the fastloop. */ + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + /* + * Before starting to issue the instructions to copy the match, + * refill the bitbuffer and preload the litlen decode table + * entry for the next loop iteration. This can increase + * performance by allowing the latency of the match copy to + * overlap with these other operations. To further reduce + * latency, we've arranged for there to be enough bits remaining + * to do the table preload independently of the refill, except + * on 32-bit platforms using the byte-at-a-time refill method. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD( + MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS, + OFFSET_MAXFASTBITS), + LITLEN_TABLEBITS) && + unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + + /* + * Copy the match. On most CPUs the fastest method is a + * word-at-a-time copy, unconditionally copying about 5 words + * since this is enough for most matches without being too much. + * + * The normal word-at-a-time copy works for offset >= WORDBYTES, + * which is most cases. 
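
A DEFLATE match copies 'length' bytes starting 'offset' bytes back in the output already produced; when offset < length the copy overlaps itself, which is why a front-to-back byte copy is the baseline semantics that the optimized word-at-a-time paths here must preserve. The sketch below is a standalone illustration with a hypothetical copy_match helper, not the fastloop code itself.

#include <stdio.h>
#include <stddef.h>

/* Append an LZ77 match to the output: copy 'length' bytes starting
 * 'offset' bytes before the current end of output. Overlapping copies
 * (offset < length) must proceed front to back so that freshly written
 * bytes can be re-read; offset 1 simply repeats the previous byte. */
static size_t copy_match(unsigned char *out, size_t out_pos,
                         size_t offset, size_t length)
{
    unsigned char *dst = out + out_pos;
    const unsigned char *src = dst - offset;

    for (size_t i = 0; i < length; i++)
        dst[i] = src[i];
    return out_pos + length;
}

int main(void)
{
    unsigned char out[32] = "abc";
    size_t pos = 3;

    /* Match with offset 3, length 6: appends "abcabc" after "abc". */
    pos = copy_match(out, pos, 3, 6);
    out[pos] = '\0';
    printf("%s\n", out);   /* abcabcabc */
    return 0;
}
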
The case of offset == 1 is also common + * and is worth optimizing for, since it is just RLE encoding of + * the previous byte, which is the result of compressing long + * runs of the same byte. + * + * Writing past the match 'length' is allowed here, since it's + * been ensured there is enough output space left for a slight + * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if + * the maximum possible overrun here is changed. + */ + if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) { + machine_word_t v; + + /* + * This part tends to get auto-vectorized, so keep it + * copying a multiple of 16 bytes at a time. + */ + v = (machine_word_t)0x0101010101010101 * src[0]; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST) { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + do { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + } while (dst < out_next); + } else { + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + } while (in_next < in_fastloop_end && out_next < out_fastloop_end); + + /* + * This is the generic loop for decoding literals and matches. This + * handles cases where in_next and out_next are close to the end of + * their respective buffers. Usually this loop isn't performance- + * critical, as most time is spent in the fastloop above instead. We + * therefore omit some optimizations here in favor of smaller code. 
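
The offset == 1 path above multiplies the repeated byte by 0x0101...01 to broadcast it into every byte lane of a machine word and then stores whole words. The standalone sketch below shows the same trick under the assumption of a 64-bit word, using memcpy() for the unaligned stores; broadcast_fill is a hypothetical helper, not libdeflate code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Fill 'n' bytes at 'dst' with the byte 'b' using word-sized stores.
 * Multiplying by 0x0101010101010101 copies the byte into every byte
 * lane of the 64-bit word. */
static void broadcast_fill(uint8_t *dst, uint8_t b, size_t n)
{
    uint64_t v = (uint64_t)0x0101010101010101ULL * b;
    size_t i = 0;

    for (; i + 8 <= n; i += 8)
        memcpy(dst + i, &v, 8);   /* whole-word stores */
    for (; i < n; i++)
        dst[i] = b;               /* tail bytes */
}

int main(void)
{
    uint8_t buf[21];

    broadcast_fill(buf, 'x', 20);
    buf[20] = '\0';
    printf("%s\n", buf);   /* twenty 'x' characters */
    return 0;
}
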
+ */ +generic_loop: + for (;;) { + u32 length, offset; + const u8 *src; + u8 *dst; + + REFILL_BITS(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) { + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + } + length = entry >> 16; + if (entry & HUFFDEC_LITERAL) { + if (unlikely(out_next == out_end)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + *out_next++ = length; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + if (unlikely(length > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + + if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS)) + REFILL_BITS(); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + if (!CAN_CONSUME(OFFSET_MAXBITS)) + REFILL_BITS(); + } + offset = entry >> 16; + offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8); + bitbuf >>= (u8)entry; + bitsleft -= entry; + + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + +block_done: + /* Finished decoding a block */ + + if (!is_final_block) + goto next_block; + + /* That was the last block. */ + + bitsleft = (u8)bitsleft; + + /* + * If any of the implicit appended zero bytes were consumed (not just + * refilled) before hitting end of stream, then the data is bad. + */ + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + + /* Optionally return the actual number of bytes consumed. */ + if (actual_in_nbytes_ret) { + /* Don't count bytes that were refilled but not consumed. */ + in_next -= (bitsleft >> 3) - overread_count; + + *actual_in_nbytes_ret = in_next - (u8 *)in; + } + + /* Optionally return the actual number of bytes written. 
*/ + if (actual_out_nbytes_ret) { + *actual_out_nbytes_ret = out_next - (u8 *)out; + } else { + if (out_next != out_end) + return LIBDEFLATE_SHORT_OUTPUT; + } + return LIBDEFLATE_SUCCESS; +} + +#undef FUNCNAME +#undef ATTRIBUTES +#undef EXTRACT_VARBITS +#undef EXTRACT_VARBITS8 diff --git a/packages/wasm/lib/libdeflate/deflate_compress.c b/packages/wasm/lib/libdeflate/deflate_compress.c new file mode 100644 index 00000000..14b92d5a --- /dev/null +++ b/packages/wasm/lib/libdeflate/deflate_compress.c @@ -0,0 +1,4119 @@ +/* + * deflate_compress.c - a compressor for DEFLATE + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "deflate_compress.h" +#include "deflate_constants.h" + +/******************************************************************************/ + +/* + * The following parameters can be changed at build time to customize the + * compression algorithms slightly: + * + * (Note, not all customizable parameters are here. Some others can be found in + * libdeflate_alloc_compressor() and in *_matchfinder.h.) + */ + +/* + * If this parameter is defined to 1, then the near-optimal parsing algorithm + * will be included, and compression levels 10-12 will use it. This algorithm + * usually produces a compression ratio significantly better than the other + * algorithms. However, it is slow. If this parameter is defined to 0, then + * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm. + */ +#define SUPPORT_NEAR_OPTIMAL_PARSING 1 + +/* + * This is the minimum block length that the compressor will use, in + * uncompressed bytes. This should be a value below which using shorter blocks + * is unlikely to be worthwhile, due to the per-block overhead. This value does + * not apply to the final block, which may be shorter than this (if the input is + * shorter, it will have to be), or to the final uncompressed block in a series + * of uncompressed blocks that cover more than UINT16_MAX bytes. + * + * This value is also approximately the amount by which what would otherwise be + * the second-to-last block is allowed to grow past the soft maximum length in + * order to avoid having to use a very short final block. + * + * Defining a fixed minimum block length is needed in order to guarantee a + * reasonable upper bound on the compressed size. It's also needed because our + * block splitting algorithm doesn't work well on very short blocks. 
+ */ +#define MIN_BLOCK_LENGTH 5000 + +/* + * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft + * maximum block length, in uncompressed bytes. The compressor will try to end + * blocks at this length, but it may go slightly past it if there is a match + * that straddles this limit or if the input data ends soon after this limit. + * This parameter doesn't apply to uncompressed blocks, which the DEFLATE format + * limits to 65535 bytes. + * + * This should be a value above which it is very likely that splitting the block + * would produce a better compression ratio. For the near-optimal compressor, + * increasing/decreasing this parameter will increase/decrease per-compressor + * memory usage linearly. + */ +#define SOFT_MAX_BLOCK_LENGTH 300000 + +/* + * For the greedy, lazy, and lazy2 compressors: this is the length of the + * sequence store, which is an array where the compressor temporarily stores + * matches that it's going to use in the current block. This value is the + * maximum number of matches that can be used in a block. If the sequence store + * fills up, then the compressor will be forced to end the block early. This + * value should be large enough so that this rarely happens, due to the block + * being ended normally before then. Increasing/decreasing this value will + * increase/decrease per-compressor memory usage linearly. + */ +#define SEQ_STORE_LENGTH 50000 + +/* + * For deflate_compress_fastest(): This is the soft maximum block length. + * deflate_compress_fastest() doesn't use the regular block splitting algorithm; + * it only ends blocks when they reach FAST_SOFT_MAX_BLOCK_LENGTH bytes or + * FAST_SEQ_STORE_LENGTH matches. Therefore, this value should be lower than + * the regular SOFT_MAX_BLOCK_LENGTH. + */ +#define FAST_SOFT_MAX_BLOCK_LENGTH 65535 + +/* + * For deflate_compress_fastest(): this is the length of the sequence store. + * This is like SEQ_STORE_LENGTH, but this should be a lower value. + */ +#define FAST_SEQ_STORE_LENGTH 8192 + +/* + * These are the maximum codeword lengths, in bits, the compressor will use for + * each Huffman code. The DEFLATE format defines limits for these. However, + * further limiting litlen codewords to 14 bits is beneficial, since it has + * negligible effect on compression ratio but allows some optimizations when + * outputting bits. (It allows 4 literals to be written at once rather than 3.) + */ +#define MAX_LITLEN_CODEWORD_LEN 14 +#define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN +#define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* Parameters specific to the near-optimal parsing algorithm */ + +/* + * BIT_COST is a scaling factor that allows the near-optimal compressor to + * consider fractional bit costs when deciding which literal/match sequence to + * use. This is useful when the true symbol costs are unknown. For example, if + * the compressor thinks that a symbol has 6.5 bits of entropy, it can set its + * cost to 6.5 bits rather than have to use 6 or 7 bits. Although in the end + * each symbol will use a whole number of bits due to the Huffman coding, + * considering fractional bits can be helpful due to the limited information. + * + * BIT_COST should be a power of 2. A value of 8 or 16 works well. A higher + * value isn't very useful since the calculations are approximate anyway. + * + * BIT_COST doesn't apply to deflate_flush_block() and + * deflate_compute_true_cost(), which consider whole bits. 
+ */ +#define BIT_COST 16 + +/* + * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to + * be needed to output a symbol that was unused in the previous optimization + * pass. Assigning a default cost allows the symbol to be used in the next + * optimization pass. However, the cost should be relatively high because the + * symbol probably won't be used very many times (if at all). + */ +#define LITERAL_NOSTAT_BITS 13 +#define LENGTH_NOSTAT_BITS 13 +#define OFFSET_NOSTAT_BITS 10 + +/* + * This is (slightly less than) the maximum number of matches that the + * near-optimal compressor will cache per block. This behaves similarly to + * SEQ_STORE_LENGTH for the other compressors. + */ +#define MATCH_CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/******************************************************************************/ + +/* Include the needed matchfinders. */ +#define MATCHFINDER_WINDOW_ORDER DEFLATE_WINDOW_ORDER +#include "hc_matchfinder.h" +#include "ht_matchfinder.h" +#if SUPPORT_NEAR_OPTIMAL_PARSING +# include "bt_matchfinder.h" +/* + * This is the maximum number of matches the binary trees matchfinder can find + * at a single position. Since the matchfinder never finds more than one match + * for the same length, presuming one of each possible length is sufficient for + * an upper bound. (This says nothing about whether it is worthwhile to + * consider so many matches; this is just defining the worst case.) + */ +#define MAX_MATCHES_PER_POS \ + (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) +#endif + +/* + * The largest block length we will ever use is when the final block is of + * length SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, or when any block is of + * length SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN. The latter case + * occurs when the lazy2 compressor chooses two literals and a maximum-length + * match, starting at SOFT_MAX_BLOCK_LENGTH - 1. + */ +#define MAX_BLOCK_LENGTH \ + MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \ + SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN) + +static void +check_buildtime_parameters(void) +{ + /* + * Verify that MIN_BLOCK_LENGTH is being honored, as + * libdeflate_deflate_compress_bound() depends on it. + */ + STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); + STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); + STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN >= + MIN_BLOCK_LENGTH); + STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >= + MIN_BLOCK_LENGTH); +#if SUPPORT_NEAR_OPTIMAL_PARSING + STATIC_ASSERT(MIN_BLOCK_LENGTH * MAX_MATCHES_PER_POS <= + MATCH_CACHE_LENGTH); +#endif + + /* The definition of MAX_BLOCK_LENGTH assumes this. */ + STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH); + + /* Verify that the sequence stores aren't uselessly large. */ + STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <= + SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); + STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN <= + FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); + + /* Verify that the maximum codeword lengths are valid. 
*/ + STATIC_ASSERT( + MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); + STATIC_ASSERT( + MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); + STATIC_ASSERT( + MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); + STATIC_ASSERT( + (1U << MAX_LITLEN_CODEWORD_LEN) >= DEFLATE_NUM_LITLEN_SYMS); + STATIC_ASSERT( + (1U << MAX_OFFSET_CODEWORD_LEN) >= DEFLATE_NUM_OFFSET_SYMS); + STATIC_ASSERT( + (1U << MAX_PRE_CODEWORD_LEN) >= DEFLATE_NUM_PRECODE_SYMS); +} + +/******************************************************************************/ + +/* Table: length slot => length slot base value */ +static const unsigned deflate_length_slot_base[] = { + 3, 4, 5, 6, 7, 8, 9, 10, + 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, + 131, 163, 195, 227, 258, +}; + +/* Table: length slot => number of extra length bits */ +static const u8 deflate_extra_length_bits[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, + 5, 5, 5, 5, 0, +}; + +/* Table: offset slot => offset slot base value */ +static const unsigned deflate_offset_slot_base[] = { + 1, 2, 3, 4, 5, 7, 9, 13, + 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, + 4097, 6145, 8193, 12289, 16385, 24577, +}; + +/* Table: offset slot => number of extra offset bits */ +static const u8 deflate_extra_offset_bits[] = { + 0, 0, 0, 0, 1, 1, 2, 2, + 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, + 11, 11, 12, 12, 13, 13, +}; + +/* Table: length => length slot */ +static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { + 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, + 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, + 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, + 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 28, +}; + +/* + * Table: 'offset - 1 => offset_slot' for offset <= 256. + * This was generated by scripts/gen_offset_slot_map.py. 
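
The three length tables above encode the DEFLATE mapping between a match length (3..258) and a (length slot, extra bits) pair. The sketch below re-derives the slot by searching the slot base table rather than using the direct 259-entry map, then checks the round trip; the base and extra-bit values are copied from the tables above, while length_to_slot is a hypothetical helper for illustration.

#include <stdio.h>

/* Length-slot base values and extra-bit counts (same values as above). */
static const unsigned slot_base[] = {
    3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
    35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258,
};
static const unsigned slot_extra_bits[] = {
    0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
    3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0,
};
#define NUM_SLOTS (sizeof(slot_base) / sizeof(slot_base[0]))

/* Find the slot whose range contains 'length' (3 <= length <= 258). */
static unsigned length_to_slot(unsigned length)
{
    unsigned slot = NUM_SLOTS - 1;

    while (slot_base[slot] > length)
        slot--;
    return slot;
}

int main(void)
{
    unsigned length = 100;
    unsigned slot = length_to_slot(length);
    unsigned extra = length - slot_base[slot];

    printf("length %u -> slot %u, %u extra bits, extra value %u\n",
           length, slot, slot_extra_bits[slot], extra);
    /* Reconstruct: slot base + extra value gives back the original length. */
    printf("reconstructed: %u\n", slot_base[slot] + extra);
    return 0;
}
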
+ */ +static const u8 deflate_offset_slot[256] = { + 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, +}; + +/* The order in which precode codeword lengths are stored */ +static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 +}; + +/* Table: precode symbol => number of extra bits */ +static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7 +}; + +/* Codewords for the DEFLATE Huffman codes */ +struct deflate_codewords { + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 offset[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* + * Codeword lengths (in bits) for the DEFLATE Huffman codes. + * A zero length means the corresponding symbol had zero frequency. + */ +struct deflate_lens { + u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u8 offset[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* Codewords and lengths for the DEFLATE Huffman codes */ +struct deflate_codes { + struct deflate_codewords codewords; + struct deflate_lens lens; +}; + +/* Symbol frequency counters for the DEFLATE Huffman codes */ +struct deflate_freqs { + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 offset[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* + * Represents a run of literals followed by a match or end-of-block. This + * struct is needed to temporarily store items chosen by the parser, since items + * cannot be written until all items for the block have been chosen and the + * block's Huffman codes have been computed. + */ +struct deflate_sequence { + + /* + * Bits 0..22: the number of literals in this run. This may be 0 and + * can be at most MAX_BLOCK_LENGTH. The literals are not stored + * explicitly in this structure; instead, they are read directly from + * the uncompressed data. + * + * Bits 23..31: the length of the match which follows the literals, or 0 + * if this literal run was the last in the block, so there is no match + * which follows it. + */ +#define SEQ_LENGTH_SHIFT 23 +#define SEQ_LITRUNLEN_MASK (((u32)1 << SEQ_LENGTH_SHIFT) - 1) + u32 litrunlen_and_length; + + /* + * If 'length' doesn't indicate end-of-block, then this is the offset of + * the match which follows the literals. + */ + u16 offset; + + /* + * If 'length' doesn't indicate end-of-block, then this is the offset + * slot of the match which follows the literals. 
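
'litrunlen_and_length' packs the literal-run length into bits 0..22 and the following match length into bits 23..31. A small standalone sketch of that packing, reusing the SEQ_LENGTH_SHIFT / SEQ_LITRUNLEN_MASK layout defined above; pack_seq is a hypothetical helper, not part of the compressor.

#include <stdint.h>
#include <stdio.h>

#define SEQ_LENGTH_SHIFT   23
#define SEQ_LITRUNLEN_MASK (((uint32_t)1 << SEQ_LENGTH_SHIFT) - 1)

/* Pack a run of 'litrunlen' literals followed by a match of 'length'
 * bytes (length 0 means this was the last literal run of the block). */
static uint32_t pack_seq(uint32_t litrunlen, uint32_t length)
{
    return litrunlen | (length << SEQ_LENGTH_SHIFT);
}

int main(void)
{
    uint32_t v = pack_seq(12345, 258);   /* 12345 literals, then a 258-byte match */

    printf("litrunlen=%u length=%u\n",
           (unsigned)(v & SEQ_LITRUNLEN_MASK),
           (unsigned)(v >> SEQ_LENGTH_SHIFT));
    return 0;
}
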
+ */ + u16 offset_slot; +}; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* Costs for the near-optimal parsing algorithm */ +struct deflate_costs { + + /* The cost to output each possible literal */ + u32 literal[DEFLATE_NUM_LITERALS]; + + /* The cost to output each possible match length */ + u32 length[DEFLATE_MAX_MATCH_LEN + 1]; + + /* The cost to output a match offset of each possible offset slot */ + u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* + * This structure represents a byte position in the input data and a node in the + * graph of possible match/literal choices for the current block. + * + * Logically, each incoming edge to this node is labeled with a literal or a + * match that can be taken to reach this position from an earlier position; and + * each outgoing edge from this node is labeled with a literal or a match that + * can be taken to advance from this position to a later position. + * + * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we + * associate with each node just two pieces of information: + * + * 'cost_to_end' is the minimum cost to reach the end of the block from + * this position. + * + * 'item' represents the literal or match that must be chosen from here to + * reach the end of the block with the minimum cost. Equivalently, this + * can be interpreted as the label of the outgoing edge on the minimum-cost + * path to the "end of block" node from this node. + */ +struct deflate_optimum_node { + + u32 cost_to_end; + + /* + * Notes on the match/literal representation used here: + * + * The low bits of 'item' are the length: 1 if this is a literal, + * or the match length if this is a match. + * + * The high bits of 'item' are the actual literal byte if this is a + * literal, or the match offset if this is a match. + */ +#define OPTIMUM_OFFSET_SHIFT 9 +#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1) + u32 item; + +}; + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/* Block split statistics. See "Block splitting algorithm" below. */ +#define NUM_LITERAL_OBSERVATION_TYPES 8 +#define NUM_MATCH_OBSERVATION_TYPES 2 +#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \ + NUM_MATCH_OBSERVATION_TYPES) +#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 +struct block_split_stats { + u32 new_observations[NUM_OBSERVATION_TYPES]; + u32 observations[NUM_OBSERVATION_TYPES]; + u32 num_new_observations; + u32 num_observations; +}; + +struct deflate_output_bitstream; + +/* The main DEFLATE compressor structure */ +struct libdeflate_compressor { + + /* Pointer to the compress() implementation chosen at allocation time */ + void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in, + size_t in_nbytes, struct deflate_output_bitstream *os); + + /* The compression level with which this compressor was created */ + unsigned compression_level; + + /* Anything of this size or less we won't bother trying to compress. 
*/ + size_t max_passthrough_size; + + /* + * The maximum search depth: consider at most this many potential + * matches at each position + */ + unsigned max_search_depth; + + /* + * The "nice" match length: if a match of this length is found, choose + * it immediately without further consideration + */ + unsigned nice_match_length; + + /* Frequency counters for the current block */ + struct deflate_freqs freqs; + + /* Block split statistics for the current block */ + struct block_split_stats split_stats; + + /* Dynamic Huffman codes for the current block */ + struct deflate_codes codes; + + /* The static Huffman codes defined by the DEFLATE format */ + struct deflate_codes static_codes; + + /* Temporary space for block flushing */ + union { + /* Information about the precode */ + struct { + u32 freqs[DEFLATE_NUM_PRECODE_SYMS]; + u32 codewords[DEFLATE_NUM_PRECODE_SYMS]; + u8 lens[DEFLATE_NUM_PRECODE_SYMS]; + unsigned items[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS]; + unsigned num_litlen_syms; + unsigned num_offset_syms; + unsigned num_explicit_lens; + unsigned num_items; + } precode; + /* + * The "full" length codewords. Used only after the information + * in 'precode' is no longer needed. + */ + struct { + u32 codewords[DEFLATE_MAX_MATCH_LEN + 1]; + u8 lens[DEFLATE_MAX_MATCH_LEN + 1]; + } length; + } o; + + union { + /* Data for greedy or lazy parsing */ + struct { + /* Hash chains matchfinder */ + struct hc_matchfinder hc_mf; + + /* Matches and literals chosen for the current block */ + struct deflate_sequence sequences[SEQ_STORE_LENGTH + 1]; + + } g; /* (g)reedy */ + + /* Data for fastest parsing */ + struct { + /* Hash table matchfinder */ + struct ht_matchfinder ht_mf; + + /* Matches and literals chosen for the current block */ + struct deflate_sequence sequences[ + FAST_SEQ_STORE_LENGTH + 1]; + + } f; /* (f)astest */ + + #if SUPPORT_NEAR_OPTIMAL_PARSING + /* Data for near-optimal parsing */ + struct { + + /* Binary tree matchfinder */ + struct bt_matchfinder bt_mf; + + /* + * Cached matches for the current block. This array + * contains the matches that were found at each position + * in the block. Specifically, for each position, there + * is a list of matches found at that position, if any, + * sorted by strictly increasing length. In addition, + * following the matches for each position, there is a + * special 'struct lz_match' whose 'length' member + * contains the number of matches found at that + * position, and whose 'offset' member contains the + * literal at that position. + * + * Note: in rare cases, there will be a very high number + * of matches in the block and this array will overflow. + * If this happens, we force the end of the current + * block. MATCH_CACHE_LENGTH is the length at which we + * actually check for overflow. The extra slots beyond + * this are enough to absorb the worst case overflow, + * which occurs if starting at + * &match_cache[MATCH_CACHE_LENGTH - 1], we write + * MAX_MATCHES_PER_POS matches and a match count header, + * then skip searching for matches at + * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the + * match count header for each. + */ + struct lz_match match_cache[MATCH_CACHE_LENGTH + + MAX_MATCHES_PER_POS + + DEFLATE_MAX_MATCH_LEN - 1]; + + /* + * Array of nodes, one per position, for running the + * minimum-cost path algorithm. + * + * This array must be large enough to accommodate the + * worst-case number of nodes, which is MAX_BLOCK_LENGTH + * plus 1 for the end-of-block node. 
+ */ + struct deflate_optimum_node optimum_nodes[ + MAX_BLOCK_LENGTH + 1]; + + /* The current cost model being used */ + struct deflate_costs costs; + + /* Saved cost model */ + struct deflate_costs costs_saved; + + /* + * A table that maps match offset to offset slot. This + * differs from deflate_offset_slot[] in that this is a + * full map, not a condensed one. The full map is more + * appropriate for the near-optimal parser, since the + * near-optimal parser does more offset => offset_slot + * translations, it doesn't intersperse them with + * matchfinding (so cache evictions are less of a + * concern), and it uses more memory anyway. + */ + u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1]; + + /* Literal/match statistics saved from previous block */ + u32 prev_observations[NUM_OBSERVATION_TYPES]; + u32 prev_num_observations; + + /* + * Approximate match length frequencies based on a + * greedy parse, gathered during matchfinding. This is + * used for setting the initial symbol costs. + */ + u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; + u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; + + /* + * The maximum number of optimization passes + * (min-cost path searches) per block. + * Larger values = more compression. + */ + unsigned max_optim_passes; + + /* + * If an optimization pass improves the cost by fewer + * than this number of bits, then optimization will stop + * early, before max_optim_passes has been reached. + * Smaller values = more compression. + */ + unsigned min_improvement_to_continue; + + /* + * The minimum number of bits that would need to be + * saved for it to be considered worth the time to + * regenerate and use the min-cost path from a previous + * optimization pass, in the case where the final + * optimization pass actually increased the cost. + * Smaller values = more compression. + */ + unsigned min_bits_to_use_nonfinal_path; + + /* + * The maximum block length, in uncompressed bytes, at + * which to find and consider the optimal match/literal + * list for the static Huffman codes. This strategy + * improves the compression ratio produced by static + * Huffman blocks and can discover more cases in which + * static blocks are worthwhile. This helps mostly with + * small blocks, hence why this parameter is a max_len. + * + * Above this block length, static Huffman blocks are + * only used opportunistically. I.e. a static Huffman + * block is only used if a static block using the same + * match/literal list as the optimized dynamic block + * happens to be cheaper than the dynamic block itself. + */ + unsigned max_len_to_optimize_static_block; + + } n; /* (n)ear-optimal */ + #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + + } p; /* (p)arser */ +}; + +/* + * The type for the bitbuffer variable, which temporarily holds bits that are + * being packed into bytes and written to the output buffer. For best + * performance, this should have size equal to a machine word. + */ +typedef machine_word_t bitbuf_t; + +/* + * The capacity of the bitbuffer, in bits. This is 1 less than the real size, + * in order to avoid undefined behavior when doing bitbuf >>= bitcount & ~7. + */ +#define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) + +/* + * Can the specified number of bits always be added to 'bitbuf' after any + * pending bytes have been flushed? There can be up to 7 bits remaining after a + * flush, so the count must not exceed BITBUF_NBITS after adding 'n' more bits. 
+ */ +#define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS) + +/* + * Structure to keep track of the current state of sending bits to the + * compressed output buffer + */ +struct deflate_output_bitstream { + + /* Bits that haven't yet been written to the output buffer */ + bitbuf_t bitbuf; + + /* + * Number of bits currently held in @bitbuf. This can be between 0 and + * BITBUF_NBITS in general, or between 0 and 7 after a flush. + */ + unsigned bitcount; + + /* + * Pointer to the position in the output buffer at which the next byte + * should be written + */ + u8 *next; + + /* Pointer to the end of the output buffer */ + u8 *end; + + /* true if the output buffer ran out of space */ + bool overflow; +}; + +/* + * Add some bits to the bitbuffer variable of the output bitstream. The caller + * must ensure that 'bitcount + n <= BITBUF_NBITS', by calling FLUSH_BITS() + * frequently enough. + */ +#define ADD_BITS(bits, n) \ +do { \ + bitbuf |= (bitbuf_t)(bits) << bitcount; \ + bitcount += (n); \ + ASSERT(bitcount <= BITBUF_NBITS); \ +} while (0) + +/* + * Flush bits from the bitbuffer variable to the output buffer. After this, the + * bitbuffer will contain at most 7 bits (a partial byte). + * + * Since deflate_flush_block() verified ahead of time that there is enough space + * remaining before actually writing the block, it's guaranteed that out_next + * won't exceed os->end. However, there might not be enough space remaining to + * flush a whole word, even though that's fastest. Therefore, flush a whole + * word if there is space for it, otherwise flush a byte at a time. + */ +#define FLUSH_BITS() \ +do { \ + if (UNALIGNED_ACCESS_IS_FAST && likely(out_next < out_fast_end)) { \ + /* Flush a whole word (branchlessly). */ \ + put_unaligned_leword(bitbuf, out_next); \ + bitbuf >>= bitcount & ~7; \ + out_next += bitcount >> 3; \ + bitcount &= 7; \ + } else { \ + /* Flush a byte at a time. */ \ + while (bitcount >= 8) { \ + ASSERT(out_next < os->end); \ + *out_next++ = bitbuf; \ + bitcount -= 8; \ + bitbuf >>= 8; \ + } \ + } \ +} while (0) + +/* + * Given the binary tree node A[subtree_idx] whose children already satisfy the + * maxheap property, swap the node with its greater child until it is greater + * than or equal to both of its children, so that the maxheap property is + * satisfied in the subtree rooted at A[subtree_idx]. 'A' uses 1-based indices. + */ +static void +heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) +{ + unsigned parent_idx; + unsigned child_idx; + u32 v; + + v = A[subtree_idx]; + parent_idx = subtree_idx; + while ((child_idx = parent_idx * 2) <= length) { + if (child_idx < length && A[child_idx + 1] > A[child_idx]) + child_idx++; + if (v >= A[child_idx]) + break; + A[parent_idx] = A[child_idx]; + parent_idx = child_idx; + } + A[parent_idx] = v; +} + +/* + * Rearrange the array 'A' so that it satisfies the maxheap property. + * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. + */ +static void +heapify_array(u32 A[], unsigned length) +{ + unsigned subtree_idx; + + for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) + heapify_subtree(A, length, subtree_idx); +} + +/* + * Sort the array 'A', which contains 'length' unsigned 32-bit integers. + * + * Note: name this function heap_sort() instead of heapsort() to avoid colliding + * with heapsort() from stdlib.h on BSD-derived systems. 
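
The output side accumulates codewords LSB-first in a bitbuffer and flushes whole bytes, as the ADD_BITS/FLUSH_BITS macros above do. The sketch below mirrors that idea in a simplified standalone form (byte-at-a-time flushing, no word-sized fast path, no overflow handling); struct bitwriter, add_bits, and flush_bits are hypothetical names for illustration.

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Minimal LSB-first bit writer. The caller must flush often enough that
 * bitcount never exceeds the bitbuffer width. */
struct bitwriter {
    uint32_t bitbuf;    /* bits not yet written, LSB = oldest bit */
    unsigned bitcount;  /* number of valid bits in bitbuf */
    uint8_t *out;
    size_t out_pos;
};

static void add_bits(struct bitwriter *bw, uint32_t bits, unsigned n)
{
    bw->bitbuf |= bits << bw->bitcount;
    bw->bitcount += n;
}

static void flush_bits(struct bitwriter *bw)
{
    while (bw->bitcount >= 8) {
        bw->out[bw->out_pos++] = (uint8_t)bw->bitbuf;
        bw->bitbuf >>= 8;
        bw->bitcount -= 8;
    }
}

int main(void)
{
    uint8_t buf[8] = { 0 };
    struct bitwriter bw = { 0, 0, buf, 0 };

    add_bits(&bw, 1, 1);     /* BFINAL = 1 */
    add_bits(&bw, 1, 2);     /* BTYPE = 01 (static Huffman) */
    add_bits(&bw, 0x55, 7);  /* some further code bits */
    flush_bits(&bw);

    printf("wrote %zu byte(s), first byte = 0x%02x, %u bit(s) pending\n",
           bw.out_pos, buf[0], bw.bitcount);
    return 0;
}
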
+ */ +static void +heap_sort(u32 A[], unsigned length) +{ + A--; /* Use 1-based indices */ + + heapify_array(A, length); + + while (length >= 2) { + u32 tmp = A[length]; + + A[length] = A[1]; + A[1] = tmp; + length--; + heapify_subtree(A, length, 1); + } +} + +#define NUM_SYMBOL_BITS 10 +#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS) +#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) +#define FREQ_MASK (~SYMBOL_MASK) + +#define GET_NUM_COUNTERS(num_syms) (num_syms) + +/* + * Sort the symbols primarily by frequency and secondarily by symbol value. + * Discard symbols with zero frequency and fill in an array with the remaining + * symbols, along with their frequencies. The low NUM_SYMBOL_BITS bits of each + * array entry will contain the symbol value, and the remaining bits will + * contain the frequency. + * + * @num_syms + * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS. + * + * @freqs[num_syms] + * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1. + * + * @lens[num_syms] + * An array that eventually will hold the length of each codeword. This + * function only fills in the codeword lengths for symbols that have zero + * frequency, which are not well defined per se but will be set to 0. + * + * @symout[num_syms] + * The output array, described above. + * + * Returns the number of entries in 'symout' that were filled. This is the + * number of symbols that have nonzero frequency. + */ +static unsigned +sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[]) +{ + unsigned sym; + unsigned i; + unsigned num_used_syms; + unsigned num_counters; + unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; + + /* + * We use heapsort, but with an added optimization. Since often most + * symbol frequencies are low, we first do a count sort using a limited + * number of counters. High frequencies are counted in the last + * counter, and only they will be sorted with heapsort. + * + * Note: with more symbols, it is generally beneficial to have more + * counters. About 1 counter per symbol seems fastest. + */ + + num_counters = GET_NUM_COUNTERS(num_syms); + + __builtin_memset(counters, 0, num_counters * sizeof(counters[0])); + + /* Count the frequencies. */ + for (sym = 0; sym < num_syms; sym++) + counters[MIN(freqs[sym], num_counters - 1)]++; + + /* + * Make the counters cumulative, ignoring the zero-th, which counted + * symbols with zero frequency. As a side effect, this calculates the + * number of symbols with nonzero frequency. + */ + num_used_syms = 0; + for (i = 1; i < num_counters; i++) { + unsigned count = counters[i]; + + counters[i] = num_used_syms; + num_used_syms += count; + } + + /* + * Sort nonzero-frequency symbols using the counters. At the same time, + * set the codeword lengths of zero-frequency symbols to 0. + */ + for (sym = 0; sym < num_syms; sym++) { + u32 freq = freqs[sym]; + + if (freq != 0) { + symout[counters[MIN(freq, num_counters - 1)]++] = + sym | (freq << NUM_SYMBOL_BITS); + } else { + lens[sym] = 0; + } + } + + /* Sort the symbols counted in the last counter. */ + heap_sort(symout + counters[num_counters - 2], + counters[num_counters - 1] - counters[num_counters - 2]); + + return num_used_syms; +} + +/* + * Build a Huffman tree. 
+ * + * This is an optimized implementation that + * (a) takes advantage of the frequencies being already sorted; + * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman + * tree are sufficient to generate a canonical code; + * (c) Only stores parent pointers, not child pointers; + * (d) Produces the nodes in the same memory used for input frequency + * information. + * + * Array 'A', which contains 'sym_count' entries, is used for both input and + * output. For this function, 'sym_count' must be at least 2. + * + * For input, the array must contain the frequencies of the symbols, sorted in + * increasing order. Specifically, each entry must contain a frequency left + * shifted by NUM_SYMBOL_BITS bits. Any data in the low NUM_SYMBOL_BITS bits of + * the entries will be ignored by this function. Although these bits will, in + * fact, contain the symbols that correspond to the frequencies, this function + * is concerned with frequencies only and keeps the symbols as-is. + * + * For output, this function will produce the non-leaf nodes of the Huffman + * tree. These nodes will be stored in the first (sym_count - 1) entries of the + * array. Entry A[sym_count - 2] will represent the root node. Each other node + * will contain the zero-based index of its parent node in 'A', left shifted by + * NUM_SYMBOL_BITS bits. The low NUM_SYMBOL_BITS bits of each entry in A will + * be kept as-is. Again, note that although these low bits will, in fact, + * contain a symbol value, this symbol will have *no relationship* with the + * Huffman tree node that happens to occupy the same slot. This is because this + * implementation only generates the non-leaf nodes of the tree. + */ +static void +build_tree(u32 A[], unsigned sym_count) +{ + const unsigned last_idx = sym_count - 1; + + /* Index of the next lowest frequency leaf that still needs a parent */ + unsigned i = 0; + + /* + * Index of the next lowest frequency non-leaf that still needs a + * parent, or 'e' if there is currently no such node + */ + unsigned b = 0; + + /* Index of the next spot for a non-leaf (will overwrite a leaf) */ + unsigned e = 0; + + do { + u32 new_freq; + + /* + * Select the next two lowest frequency nodes among the leaves + * A[i] and non-leaves A[b], and create a new node A[e] to be + * their parent. Set the new node's frequency to the sum of the + * frequencies of its two children. + * + * Usually the next two lowest frequency nodes are of the same + * type (leaf or non-leaf), so check those cases first. + */ + if (i + 1 <= last_idx && + (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) { + /* Two leaves */ + new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK); + i += 2; + } else if (b + 2 <= e && + (i > last_idx || + (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) { + /* Two non-leaves */ + new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + A[b + 1] = (e << NUM_SYMBOL_BITS) | + (A[b + 1] & SYMBOL_MASK); + b += 2; + } else { + /* One leaf and one non-leaf */ + new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + i++; + b++; + } + A[e] = new_freq | (A[e] & SYMBOL_MASK); + /* + * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the + * tree is complete once we've created 'n - 1' non-leaves. 
+ */ + } while (++e < last_idx); +} + +/* + * Given the stripped-down Huffman tree constructed by build_tree(), determine + * the number of codewords that should be assigned each possible length, taking + * into account the length-limited constraint. + * + * @A + * The array produced by build_tree(), containing parent index information + * for the non-leaf nodes of the Huffman tree. Each entry in this array is + * a node; a node's parent always has a greater index than that node + * itself. This function will overwrite the parent index information in + * this array, so essentially it will destroy the tree. However, the data + * in the low NUM_SYMBOL_BITS of each entry will be preserved. + * + * @root_idx + * The 0-based index of the root node in 'A', and consequently one less + * than the number of tree node entries in 'A'. (Or, really 2 less than + * the actual length of 'A'.) + * + * @len_counts + * An array of length ('max_codeword_len' + 1) in which the number of + * codewords having each length <= max_codeword_len will be returned. + * + * @max_codeword_len + * The maximum permissible codeword length. + */ +static void +compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[], + unsigned max_codeword_len) +{ + unsigned len; + int node; + + /* + * The key observations are: + * + * (1) We can traverse the non-leaf nodes of the tree, always visiting a + * parent before its children, by simply iterating through the array + * in reverse order. Consequently, we can compute the depth of each + * node in one pass, overwriting the parent indices with depths. + * + * (2) We can initially assume that in the real Huffman tree, both + * children of the root are leaves. This corresponds to two + * codewords of length 1. Then, whenever we visit a (non-leaf) node + * during the traversal, we modify this assumption to account for + * the current node *not* being a leaf, but rather its two children + * being leaves. This causes the loss of one codeword for the + * current depth and the addition of two codewords for the current + * depth plus one. + * + * (3) We can handle the length-limited constraint fairly easily by + * simply using the largest length available when a depth exceeds + * max_codeword_len. + */ + + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + len_counts[1] = 2; + + /* Set the root node's depth to 0. */ + A[root_idx] &= SYMBOL_MASK; + + for (node = root_idx - 1; node >= 0; node--) { + + /* Calculate the depth of this node. */ + + unsigned parent = A[node] >> NUM_SYMBOL_BITS; + unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; + unsigned depth = parent_depth + 1; + + /* + * Set the depth of this node so that it is available when its + * children (if any) are processed. + */ + A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); + + /* + * If needed, decrease the length to meet the length-limited + * constraint. This is not the optimal method for generating + * length-limited Huffman codes! But it should be good enough. + */ + if (depth >= max_codeword_len) { + depth = max_codeword_len; + do { + depth--; + } while (len_counts[depth] == 0); + } + + /* + * Account for the fact that we have a non-leaf node at the + * current depth. + */ + len_counts[depth]--; + len_counts[depth + 1] += 2; + } +} + +/* + * DEFLATE uses bit-reversed codewords, so we must bit-reverse the codewords + * after generating them. All codewords have length <= 16 bits. If the CPU has + * a bit-reversal instruction, then that is the fastest method. 
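+ * (Bit-reversing a codeword means mirroring it within its length: the 3-bit
+ * codeword 0b110 becomes 0b011, for instance.)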
Otherwise the + * fastest method is to reverse the bits in each of the two bytes using a table. + * The table method is slightly faster than using bitwise operations to flip + * adjacent 1, 2, 4, and then 8-bit fields, even if 2 to 4 codewords are packed + * into a machine word and processed together using that method. + */ + +#ifdef rbit32 +static u32 reverse_codeword(u32 codeword, u8 len) +{ + return rbit32(codeword) >> ((32 - len) & 31); +} +#else +/* Generated by scripts/gen_bitreverse_tab.py */ +static const u8 bitreverse_tab[256] = { + 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, + 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, + 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, + 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, + 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, + 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, + 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, + 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, + 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, + 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, + 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, + 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, + 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, + 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, + 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, + 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, + 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, + 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, + 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, + 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, + 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, + 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, + 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, + 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, + 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, + 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, + 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, + 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, + 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, + 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, + 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, + 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, +}; + +static u32 reverse_codeword(u32 codeword, u8 len) +{ + STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); + codeword = ((u32)bitreverse_tab[codeword & 0xff] << 8) | + bitreverse_tab[codeword >> 8]; + return codeword >> (16 - len); +} +#endif /* !rbit32 */ + +/* + * Generate the codewords for a canonical Huffman code. + * + * @A + * The output array for codewords. In addition, initially this + * array must contain the symbols, sorted primarily by frequency and + * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of + * each entry. + * + * @len + * Output array for codeword lengths. + * + * @len_counts + * An array that provides the number of codewords that will have + * each possible length <= max_codeword_len. + * + * @max_codeword_len + * Maximum length, in bits, of each codeword. + * + * @num_syms + * Number of symbols in the alphabet, including symbols with zero + * frequency. This is the length of the 'A' and 'len' arrays. + */ +static void +gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[], + unsigned max_codeword_len, unsigned num_syms) +{ + u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned i; + unsigned len; + unsigned sym; + + /* + * Given the number of codewords that will have each length, assign + * codeword lengths to symbols. 
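+ * For example, if len_counts[] says that one codeword has length 1 and two
+ * have length 2, the canonical codewords generated below are '0', '10' and
+ * '11' (before bit-reversal).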
We do this by assigning the lengths in + * decreasing order to the symbols sorted primarily by increasing + * frequency and secondarily by increasing symbol value. + */ + for (i = 0, len = max_codeword_len; len >= 1; len--) { + unsigned count = len_counts[len]; + + while (count--) + lens[A[i++] & SYMBOL_MASK] = len; + } + + /* + * Generate the codewords themselves. We initialize the + * 'next_codewords' array to provide the lexicographically first + * codeword of each length, then assign codewords in symbol order. This + * produces a canonical code. + */ + next_codewords[0] = 0; + next_codewords[1] = 0; + for (len = 2; len <= max_codeword_len; len++) + next_codewords[len] = + (next_codewords[len - 1] + len_counts[len - 1]) << 1; + + for (sym = 0; sym < num_syms; sym++) { + /* DEFLATE requires bit-reversed codewords. */ + A[sym] = reverse_codeword(next_codewords[lens[sym]]++, + lens[sym]); + } +} + +/* + * --------------------------------------------------------------------- + * deflate_make_huffman_code() + * --------------------------------------------------------------------- + * + * Given an alphabet and the frequency of each symbol in it, construct a + * length-limited canonical Huffman code. + * + * @num_syms + * The number of symbols in the alphabet. The symbols are the integers in + * the range [0, num_syms - 1]. This parameter must be at least 2 and + * must not exceed (1 << NUM_SYMBOL_BITS). + * + * @max_codeword_len + * The maximum permissible codeword length. + * + * @freqs + * An array of length @num_syms that gives the frequency of each symbol. + * It is valid for some, none, or all of the frequencies to be 0. The sum + * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1. + * + * @lens + * An array of @num_syms entries in which this function will return the + * length, in bits, of the codeword assigned to each symbol. Symbols with + * 0 frequency will not have codewords per se, but their entries in this + * array will be set to 0. No lengths greater than @max_codeword_len will + * be assigned. + * + * @codewords + * An array of @num_syms entries in which this function will return the + * codeword for each symbol, right-justified and padded on the left with + * zeroes. Codewords for symbols with 0 frequency will be undefined. + * + * --------------------------------------------------------------------- + * + * This function builds a length-limited canonical Huffman code. + * + * A length-limited Huffman code contains no codewords longer than some + * specified length, and has exactly (with some algorithms) or approximately + * (with the algorithm used here) the minimum weighted path length from the + * root, given this constraint. + * + * A canonical Huffman code satisfies the properties that a longer codeword + * never lexicographically precedes a shorter codeword, and the lexicographic + * ordering of codewords of the same length is the same as the lexicographic + * ordering of the corresponding symbols. A canonical Huffman code, or more + * generally a canonical prefix code, can be reconstructed from only a list + * containing the codeword length of each symbol. + * + * The classic algorithm to generate a Huffman code creates a node for each + * symbol, then inserts these nodes into a min-heap keyed by symbol frequency. + * Then, repeatedly, the two lowest-frequency nodes are removed from the + * min-heap and added as the children of a new node having frequency equal to + * the sum of its two children, which is then inserted into the min-heap. 
When + * only a single node remains in the min-heap, it is the root of the Huffman + * tree. The codeword for each symbol is determined by the path needed to reach + * the corresponding node from the root. Descending to the left child appends a + * 0 bit, whereas descending to the right child appends a 1 bit. + * + * The classic algorithm is relatively easy to understand, but it is subject to + * a number of inefficiencies. In practice, it is fastest to first sort the + * symbols by frequency. (This itself can be subject to an optimization based + * on the fact that most frequencies tend to be low.) At the same time, we sort + * secondarily by symbol value, which aids the process of generating a canonical + * code. Then, during tree construction, no heap is necessary because both the + * leaf nodes and the unparented non-leaf nodes can be easily maintained in + * sorted order. Consequently, there can never be more than two possibilities + * for the next-lowest-frequency node. + * + * In addition, because we're generating a canonical code, we actually don't + * need the leaf nodes of the tree at all, only the non-leaf nodes. This is + * because for canonical code generation we don't need to know where the symbols + * are in the tree. Rather, we only need to know how many leaf nodes have each + * depth (codeword length). And this information can, in fact, be quickly + * generated from the tree of non-leaves only. + * + * Furthermore, we can build this stripped-down Huffman tree directly in the + * array in which the codewords are to be generated, provided that these array + * slots are large enough to hold a symbol and frequency value. + * + * Still furthermore, we don't even need to maintain explicit child pointers. + * We only need the parent pointers, and even those can be overwritten in-place + * with depth information as part of the process of extracting codeword lengths + * from the tree. So in summary, we do NOT need a big structure like: + * + * struct huffman_tree_node { + * unsigned int symbol; + * unsigned int frequency; + * unsigned int depth; + * struct huffman_tree_node *left_child; + * struct huffman_tree_node *right_child; + * }; + * + * + * ... which often gets used in "naive" implementations of Huffman code + * generation. + * + * Many of these optimizations are based on the implementation in 7-Zip (source + * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov. + */ +static void +deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const u32 freqs[], u8 lens[], u32 codewords[]) +{ + u32 *A = codewords; + unsigned num_used_syms; + + STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); + STATIC_ASSERT(MAX_BLOCK_LENGTH <= ((u32)1 << NUM_FREQ_BITS) - 1); + + /* + * We begin by sorting the symbols primarily by frequency and + * secondarily by symbol value. As an optimization, the array used for + * this purpose ('A') shares storage with the space in which we will + * eventually return the codewords. + */ + num_used_syms = sort_symbols(num_syms, freqs, lens, A); + /* + * 'num_used_syms' is the number of symbols with nonzero frequency. + * This may be less than @num_syms. 'num_used_syms' is also the number + * of entries in 'A' that are valid. Each entry consists of a distinct + * symbol and a nonzero frequency packed into a 32-bit integer. + */ + + /* + * A complete Huffman code must contain at least 2 codewords. Yet, it's + * possible that fewer than 2 symbols were used. 
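+ * (A block that contains only literals and no matches, for instance, uses no
+ * offset symbols at all.)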
When this happens, + * it's usually for the offset code (0-1 symbols used). But it's also + * theoretically possible for the litlen and pre codes (1 symbol used). + * + * The DEFLATE RFC explicitly allows the offset code to contain just 1 + * codeword, or even be completely empty. But it's silent about the + * other codes. It also doesn't say whether, in the 1-codeword case, + * the codeword (which it says must be 1 bit) is '0' or '1'. + * + * In any case, some DEFLATE decompressors reject these cases. zlib + * generally allows them, but it does reject precodes that have just 1 + * codeword. More problematically, zlib v1.2.1 and earlier rejected + * empty offset codes, and this behavior can also be seen in Windows + * Explorer's ZIP unpacker (supposedly even still in Windows 11). + * + * Other DEFLATE compressors, including zlib, always send at least 2 + * codewords in order to make a complete Huffman code. Therefore, this + * is a case where practice does not entirely match the specification. + * We follow practice by generating 2 codewords of length 1: codeword + * '0' for symbol 0, and codeword '1' for another symbol -- the used + * symbol if it exists and is not symbol 0, otherwise symbol 1. This + * does worsen the compression ratio by having to send an unnecessary + * offset codeword length. But this only affects rare cases such as + * blocks containing all literals, and it only makes a tiny difference. + */ + if (unlikely(num_used_syms < 2)) { + unsigned sym = num_used_syms ? (A[0] & SYMBOL_MASK) : 0; + unsigned nonzero_idx = sym ? sym : 1; + + codewords[0] = 0; + lens[0] = 1; + codewords[nonzero_idx] = 1; + lens[nonzero_idx] = 1; + return; + } + + /* + * Build a stripped-down version of the Huffman tree, sharing the array + * 'A' with the symbol values. Then extract length counts from the tree + * and use them to generate the final codewords. + */ + + build_tree(A, num_used_syms); + + { + unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; + + compute_length_counts(A, num_used_syms - 2, + len_counts, max_codeword_len); + + gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); + } +} + +/* + * Clear the Huffman symbol frequency counters. This must be called when + * starting a new DEFLATE block. + */ +static void +deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) +{ + __builtin_memset(&c->freqs, 0, sizeof(c->freqs)); +} + +/* + * Build the literal/length and offset Huffman codes for a DEFLATE block. + * + * This takes as input the frequency tables for each alphabet and produces as + * output a set of tables that map symbols to codewords and codeword lengths. + */ +static void +deflate_make_huffman_codes(const struct deflate_freqs *freqs, + struct deflate_codes *codes) +{ + deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, + MAX_LITLEN_CODEWORD_LEN, + freqs->litlen, + codes->lens.litlen, + codes->codewords.litlen); + + deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, + MAX_OFFSET_CODEWORD_LEN, + freqs->offset, + codes->lens.offset, + codes->codewords.offset); +} + +/* Initialize c->static_codes. 
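+ *
+ * The static (fixed) Huffman code defined by the DEFLATE RFC uses codeword
+ * lengths 8, 9, 7 and 8 bits for litlen symbols 0-143, 144-255, 256-279 and
+ * 280-287 respectively, and 5 bits for every offset symbol. Feeding
+ * frequencies proportional to 2^(-len) into the regular Huffman code builder
+ * reproduces exactly those lengths.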
*/ +static void +deflate_init_static_codes(struct libdeflate_compressor *c) +{ + unsigned i; + + for (i = 0; i < 144; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + for (; i < 256; i++) + c->freqs.litlen[i] = 1 << (9 - 9); + for (; i < 280; i++) + c->freqs.litlen[i] = 1 << (9 - 7); + for (; i < 288; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + + for (i = 0; i < 32; i++) + c->freqs.offset[i] = 1 << (5 - 5); + + deflate_make_huffman_codes(&c->freqs, &c->static_codes); +} + +/* Return the offset slot for the given match offset, using the small map. */ +static unsigned +deflate_get_offset_slot(u32 offset) +{ + /* + * 1 <= offset <= 32768 here. For 1 <= offset <= 256, + * deflate_offset_slot[offset - 1] gives the slot. + * + * For 257 <= offset <= 32768, we take advantage of the fact that 257 is + * the beginning of slot 16, and each slot [16..30) is exactly 1 << 7 == + * 128 times larger than each slot [2..16) (since the number of extra + * bits increases by 1 every 2 slots). Thus, the slot is: + * + * deflate_offset_slot[2 + ((offset - 257) >> 7)] + (16 - 2) + * == deflate_offset_slot[((offset - 1) >> 7)] + 14 + * + * Define 'n = (offset <= 256) ? 0 : 7'. Then any offset is handled by: + * + * deflate_offset_slot[(offset - 1) >> n] + (n << 1) + * + * For better performance, replace 'n = (offset <= 256) ? 0 : 7' with + * the equivalent (for offset <= 536871168) 'n = (256 - offset) >> 29'. + */ + unsigned n = (256 - offset) >> 29; + + return deflate_offset_slot[(offset - 1) >> n] + (n << 1); +} + +static unsigned +deflate_compute_precode_items(const u8 lens[], const unsigned num_lens, + u32 precode_freqs[], unsigned precode_items[]) +{ + unsigned *itemptr; + unsigned run_start; + unsigned run_end; + unsigned extra_bits; + u8 len; + + __builtin_memset(precode_freqs, 0, + DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); + + itemptr = precode_items; + run_start = 0; + do { + /* Find the next run of codeword lengths. */ + + /* len = the length being repeated */ + len = lens[run_start]; + + /* Extend the run. */ + run_end = run_start; + do { + run_end++; + } while (run_end != num_lens && len == lens[run_end]); + + if (len == 0) { + /* Run of zeroes. */ + + /* Symbol 18: RLE 11 to 138 zeroes at a time. */ + while ((run_end - run_start) >= 11) { + extra_bits = MIN((run_end - run_start) - 11, + 0x7F); + precode_freqs[18]++; + *itemptr++ = 18 | (extra_bits << 5); + run_start += 11 + extra_bits; + } + + /* Symbol 17: RLE 3 to 10 zeroes at a time. */ + if ((run_end - run_start) >= 3) { + extra_bits = MIN((run_end - run_start) - 3, + 0x7); + precode_freqs[17]++; + *itemptr++ = 17 | (extra_bits << 5); + run_start += 3 + extra_bits; + } + } else { + + /* A run of nonzero lengths. */ + + /* Symbol 16: RLE 3 to 6 of the previous length. */ + if ((run_end - run_start) >= 4) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + do { + extra_bits = MIN((run_end - run_start) - + 3, 0x3); + precode_freqs[16]++; + *itemptr++ = 16 | (extra_bits << 5); + run_start += 3 + extra_bits; + } while ((run_end - run_start) >= 3); + } + } + + /* Output any remaining lengths without RLE. 
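+ * (This covers zero-runs shorter than 3, nonzero runs shorter than 4, and
+ * the at most 2 lengths that the RLE symbols above can leave behind.)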
*/ + while (run_start != run_end) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + } + } while (run_start != num_lens); + + return itemptr - precode_items; +} + +/* + * Huffman codeword lengths for dynamic Huffman blocks are compressed using a + * separate Huffman code, the "precode", which contains a symbol for each + * possible codeword length in the larger code as well as several special + * symbols to represent repeated codeword lengths (a form of run-length + * encoding). The precode is itself constructed in canonical form, and its + * codeword lengths are represented literally in 19 3-bit fields that + * immediately precede the compressed codeword lengths of the larger code. + */ + +/* Precompute the information needed to output dynamic Huffman codes. */ +static void +deflate_precompute_huffman_header(struct libdeflate_compressor *c) +{ + /* Compute how many litlen and offset symbols are needed. */ + + for (c->o.precode.num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; + c->o.precode.num_litlen_syms > 257; + c->o.precode.num_litlen_syms--) + if (c->codes.lens.litlen[c->o.precode.num_litlen_syms - 1] != 0) + break; + + for (c->o.precode.num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; + c->o.precode.num_offset_syms > 1; + c->o.precode.num_offset_syms--) + if (c->codes.lens.offset[c->o.precode.num_offset_syms - 1] != 0) + break; + + /* + * If we're not using the full set of literal/length codeword lengths, + * then temporarily move the offset codeword lengths over so that the + * literal/length and offset codeword lengths are contiguous. + */ + STATIC_ASSERT(offsetof(struct deflate_lens, offset) == + DEFLATE_NUM_LITLEN_SYMS); + if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { + __builtin_memmove((u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, + (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, + c->o.precode.num_offset_syms); + } + + /* + * Compute the "items" (RLE / literal tokens and extra bits) with which + * the codeword lengths in the larger code will be output. + */ + c->o.precode.num_items = + deflate_compute_precode_items((u8 *)&c->codes.lens, + c->o.precode.num_litlen_syms + + c->o.precode.num_offset_syms, + c->o.precode.freqs, + c->o.precode.items); + + /* Build the precode. */ + deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, + MAX_PRE_CODEWORD_LEN, + c->o.precode.freqs, c->o.precode.lens, + c->o.precode.codewords); + + /* Count how many precode lengths we actually need to output. */ + for (c->o.precode.num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; + c->o.precode.num_explicit_lens > 4; + c->o.precode.num_explicit_lens--) + if (c->o.precode.lens[deflate_precode_lens_permutation[ + c->o.precode.num_explicit_lens - 1]] != 0) + break; + + /* Restore the offset codeword lengths if needed. */ + if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { + __builtin_memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, + (u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, + c->o.precode.num_offset_syms); + } +} + +/* + * To make it faster to output matches, compute the "full" match length + * codewords, i.e. the concatenation of the litlen codeword and the extra bits + * for each possible match length. 
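+ *
+ * For example, a match of length 12 is coded as litlen symbol 265 followed
+ * by one extra bit with value 1; the precomputed entry packs both into a
+ * single value, so the whole length can be written with one ADD_BITS() call.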
+ */ +static void +deflate_compute_full_len_codewords(struct libdeflate_compressor *c, + const struct deflate_codes *codes) +{ + unsigned len; + + STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_LENGTH_BITS <= 32); + + for (len = DEFLATE_MIN_MATCH_LEN; len <= DEFLATE_MAX_MATCH_LEN; len++) { + unsigned slot = deflate_length_slot[len]; + unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + slot; + u32 extra_bits = len - deflate_length_slot_base[slot]; + + c->o.length.codewords[len] = + codes->codewords.litlen[litlen_sym] | + (extra_bits << codes->lens.litlen[litlen_sym]); + c->o.length.lens[len] = codes->lens.litlen[litlen_sym] + + deflate_extra_length_bits[slot]; + } +} + +/* Write a match to the output buffer. */ +#define WRITE_MATCH(c_, codes_, length_, offset_, offset_slot_) \ +do { \ + const struct libdeflate_compressor *c__ = (c_); \ + const struct deflate_codes *codes__ = (codes_); \ + unsigned length__ = (length_); \ + unsigned offset__ = (offset_); \ + unsigned offset_slot__ = (offset_slot_); \ + \ + /* Litlen symbol and extra length bits */ \ + STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS)); \ + ADD_BITS(c__->o.length.codewords[length__], \ + c__->o.length.lens[length__]); \ + \ + if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS + \ + MAX_OFFSET_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ + FLUSH_BITS(); \ + \ + /* Offset symbol */ \ + ADD_BITS(codes__->codewords.offset[offset_slot__], \ + codes__->lens.offset[offset_slot__]); \ + \ + if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ + FLUSH_BITS(); \ + \ + /* Extra offset bits */ \ + ADD_BITS(offset__ - deflate_offset_slot_base[offset_slot__], \ + deflate_extra_offset_bits[offset_slot__]); \ + \ + FLUSH_BITS(); \ +} while (0) + +/* + * Choose the best type of block to use (dynamic Huffman, static Huffman, or + * uncompressed), then output it. + * + * The uncompressed data of the block is @block_begin[0..@block_length-1]. The + * sequence of literals and matches that will be used to compress the block (if + * a compressed block is chosen) is given by @sequences if it's non-NULL, or + * else @c->p.n.optimum_nodes. @c->freqs and @c->codes must be already set + * according to the literals, matches, and end-of-block symbol. + */ +static void +deflate_flush_block(struct libdeflate_compressor *c, + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct deflate_sequence *sequences, + bool is_final_block) +{ + /* + * It is hard to get compilers to understand that writes to 'os->next' + * don't alias 'os'. That hurts performance significantly, as + * everything in 'os' would keep getting re-loaded. ('restrict' + * *should* do the trick, but it's unreliable.) Therefore, we keep all + * the output bitstream state in local variables, and output bits using + * macros. This is similar to what the decompressor does. + */ + const u8 *in_next = block_begin; + const u8 * const in_end = block_begin + block_length; + bitbuf_t bitbuf = os->bitbuf; + unsigned bitcount = os->bitcount; + u8 *out_next = os->next; + u8 * const out_fast_end = + os->end - MIN(WORDBYTES - 1, os->end - out_next); + /* + * The cost for each block type, in bits. Start with the cost of the + * block header which is 3 bits. 
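+ * (1 bit for BFINAL plus 2 bits for BTYPE.)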
+ */ + u32 dynamic_cost = 3; + u32 static_cost = 3; + u32 uncompressed_cost = 3; + u32 best_cost; + struct deflate_codes *codes; + unsigned sym; + + ASSERT(block_length >= MIN_BLOCK_LENGTH || + (is_final_block && block_length > 0)); + ASSERT(block_length <= MAX_BLOCK_LENGTH); + ASSERT(bitcount <= 7); + ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); + ASSERT(out_next <= os->end); + ASSERT(!os->overflow); + + /* Precompute the precode items and build the precode. */ + deflate_precompute_huffman_header(c); + + /* Account for the cost of encoding dynamic Huffman codes. */ + dynamic_cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); + for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { + u32 extra = deflate_extra_precode_bits[sym]; + + dynamic_cost += c->o.precode.freqs[sym] * + (extra + c->o.precode.lens[sym]); + } + + /* Account for the cost of encoding literals. */ + for (sym = 0; sym < 144; sym++) { + dynamic_cost += c->freqs.litlen[sym] * + c->codes.lens.litlen[sym]; + static_cost += c->freqs.litlen[sym] * 8; + } + for (; sym < 256; sym++) { + dynamic_cost += c->freqs.litlen[sym] * + c->codes.lens.litlen[sym]; + static_cost += c->freqs.litlen[sym] * 9; + } + + /* Account for the cost of encoding the end-of-block symbol. */ + dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK]; + static_cost += 7; + + /* Account for the cost of encoding lengths. */ + for (sym = DEFLATE_FIRST_LEN_SYM; + sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits); + sym++) { + u32 extra = deflate_extra_length_bits[ + sym - DEFLATE_FIRST_LEN_SYM]; + + dynamic_cost += c->freqs.litlen[sym] * + (extra + c->codes.lens.litlen[sym]); + static_cost += c->freqs.litlen[sym] * + (extra + c->static_codes.lens.litlen[sym]); + } + + /* Account for the cost of encoding offsets. */ + for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { + u32 extra = deflate_extra_offset_bits[sym]; + + dynamic_cost += c->freqs.offset[sym] * + (extra + c->codes.lens.offset[sym]); + static_cost += c->freqs.offset[sym] * (extra + 5); + } + + /* Compute the cost of using uncompressed blocks. */ + uncompressed_cost += (-(bitcount + 3) & 7) + 32 + + (40 * (DIV_ROUND_UP(block_length, + UINT16_MAX) - 1)) + + (8 * block_length); + + /* + * Choose and output the cheapest type of block. If there is a tie, + * prefer uncompressed, then static, then dynamic. + */ + + best_cost = MIN(dynamic_cost, MIN(static_cost, uncompressed_cost)); + + /* If the block isn't going to fit, then stop early. */ + if (DIV_ROUND_UP(bitcount + best_cost, 8) > os->end - out_next) { + os->overflow = true; + return; + } + /* + * Else, now we know that the block fits, so no further bounds checks on + * the output buffer are required until the next block. + */ + + if (best_cost == uncompressed_cost) { + /* + * Uncompressed block(s). DEFLATE limits the length of + * uncompressed blocks to UINT16_MAX bytes, so if the length of + * the "block" we're flushing is over UINT16_MAX, we actually + * output multiple blocks. + */ + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; + + if (in_end - in_next <= UINT16_MAX) { + bfinal = is_final_block; + len = in_end - in_next; + } + /* It was already checked that there is enough space. */ + ASSERT(os->end - out_next >= + DIV_ROUND_UP(bitcount + 3, 8) + 4 + len); + /* + * Output BFINAL (1 bit) and BTYPE (2 bits), then align + * to a byte boundary. 
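+ * Since BTYPE is 0 for uncompressed blocks, only the BFINAL bit needs to
+ * be merged into the pending bits; when bitcount > 5 the 3 header bits no
+ * longer fit in the first byte, so a second, all-zero byte is emitted.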
+ */ + STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); + *out_next++ = (bfinal << bitcount) | bitbuf; + if (bitcount > 5) + *out_next++ = 0; + bitbuf = 0; + bitcount = 0; + /* Output LEN and NLEN, then the data itself. */ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + __builtin_memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + /* Done outputting uncompressed block(s) */ + goto out; + } + + if (best_cost == static_cost) { + /* Static Huffman block */ + codes = &c->static_codes; + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); + FLUSH_BITS(); + } else { + const unsigned num_explicit_lens = c->o.precode.num_explicit_lens; + const unsigned num_precode_items = c->o.precode.num_items; + unsigned precode_sym, precode_item; + unsigned i; + + /* Dynamic Huffman block */ + + codes = &c->codes; + STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, 2); + ADD_BITS(c->o.precode.num_litlen_syms - 257, 5); + ADD_BITS(c->o.precode.num_offset_syms - 1, 5); + ADD_BITS(num_explicit_lens - 4, 4); + + /* Output the lengths of the codewords in the precode. */ + if (CAN_BUFFER(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { + /* + * A 64-bit bitbuffer is just one bit too small to hold + * the maximum number of precode lens, so to minimize + * flushes we merge one len with the previous fields. + */ + precode_sym = deflate_precode_lens_permutation[0]; + ADD_BITS(c->o.precode.lens[precode_sym], 3); + FLUSH_BITS(); + i = 1; /* num_explicit_lens >= 4 */ + do { + precode_sym = + deflate_precode_lens_permutation[i]; + ADD_BITS(c->o.precode.lens[precode_sym], 3); + } while (++i < num_explicit_lens); + FLUSH_BITS(); + } else { + FLUSH_BITS(); + i = 0; + do { + precode_sym = + deflate_precode_lens_permutation[i]; + ADD_BITS(c->o.precode.lens[precode_sym], 3); + FLUSH_BITS(); + } while (++i < num_explicit_lens); + } + + /* + * Output the lengths of the codewords in the litlen and offset + * codes, encoded by the precode. + */ + i = 0; + do { + precode_item = c->o.precode.items[i]; + precode_sym = precode_item & 0x1F; + STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7)); + ADD_BITS(c->o.precode.codewords[precode_sym], + c->o.precode.lens[precode_sym]); + ADD_BITS(precode_item >> 5, + deflate_extra_precode_bits[precode_sym]); + FLUSH_BITS(); + } while (++i < num_precode_items); + } + + /* Output the literals and matches for a dynamic or static block. */ + ASSERT(bitcount <= 7); + deflate_compute_full_len_codewords(c, codes); +#if SUPPORT_NEAR_OPTIMAL_PARSING + if (sequences == NULL) { + /* Output the literals and matches from the minimum-cost path */ + struct deflate_optimum_node *cur_node = + &c->p.n.optimum_nodes[0]; + struct deflate_optimum_node * const end_node = + &c->p.n.optimum_nodes[block_length]; + do { + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_node->item >> + OPTIMUM_OFFSET_SHIFT; + if (length == 1) { + /* Literal */ + ADD_BITS(codes->codewords.litlen[offset], + codes->lens.litlen[offset]); + FLUSH_BITS(); + } else { + /* Match */ + WRITE_MATCH(c, codes, length, offset, + c->p.n.offset_slot_full[offset]); + } + cur_node += length; + } while (cur_node != end_node); + } else +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + { + /* Output the literals and matches from the sequences list. 
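+ * (Each sequence is a run of literals followed by one match; the final
+ * sequence has length 0 and only flushes the trailing run of literals.)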
*/ + const struct deflate_sequence *seq; + + for (seq = sequences; ; seq++) { + u32 litrunlen = seq->litrunlen_and_length & + SEQ_LITRUNLEN_MASK; + unsigned length = seq->litrunlen_and_length >> + SEQ_LENGTH_SHIFT; + unsigned lit; + + /* Output a run of literals. */ + if (CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) { + for (; litrunlen >= 4; litrunlen -= 4) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + FLUSH_BITS(); + } + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + } + } + FLUSH_BITS(); + } + } else { + while (litrunlen--) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + FLUSH_BITS(); + } + } + + if (length == 0) { /* Last sequence? */ + ASSERT(in_next == in_end); + break; + } + + /* Output a match. */ + WRITE_MATCH(c, codes, length, seq->offset, + seq->offset_slot); + in_next += length; + } + } + + /* Output the end-of-block symbol. */ + ASSERT(bitcount <= 7); + ADD_BITS(codes->codewords.litlen[DEFLATE_END_OF_BLOCK], + codes->lens.litlen[DEFLATE_END_OF_BLOCK]); + FLUSH_BITS(); +out: + ASSERT(bitcount <= 7); + /* + * Assert that the block cost was computed correctly. This is relied on + * above for the bounds check on the output buffer. Also, + * libdeflate_deflate_compress_bound() relies on this via the assumption + * that uncompressed blocks will always be used when cheapest. + */ + ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == best_cost); + os->bitbuf = bitbuf; + os->bitcount = bitcount; + os->next = out_next; +} + +static void +deflate_finish_block(struct libdeflate_compressor *c, + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct deflate_sequence *sequences, + bool is_final_block) +{ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_flush_block(c, os, block_begin, block_length, sequences, + is_final_block); +} + +/******************************************************************************/ + +/* + * Block splitting algorithm. The problem is to decide when it is worthwhile to + * start a new block with new Huffman codes. There is a theoretically optimal + * solution: recursively consider every possible block split, considering the + * exact cost of each block, and choose the minimum cost approach. But this is + * far too slow. Instead, as an approximation, we can count symbols and after + * every N symbols, compare the expected distribution of symbols based on the + * previous data with the actual distribution. If they differ "by enough", then + * start a new block. + * + * As an optimization and heuristic, we don't distinguish between every symbol + * but rather we combine many symbols into a single "observation type". For + * literals we only look at the high bits and low bits, and for matches we only + * look at whether the match is long or not. 
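+ * (A match counts as "long" when its length is at least 9; see
+ * observe_match() below.)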
The assumption is that for typical + * "real" data, places that are good block boundaries will tend to be noticeable + * based only on changes in these aggregate probabilities, without looking for + * subtle differences in individual symbols. For example, a change from ASCII + * bytes to non-ASCII bytes, or from few matches (generally less compressible) + * to many matches (generally more compressible), would be easily noticed based + * on the aggregates. + * + * For determining whether the probability distributions are "different enough" + * to start a new block, the simple heuristic of splitting when the sum of + * absolute differences exceeds a constant seems to be good enough. We also add + * a number proportional to the block length so that the algorithm is more + * likely to end long blocks than short blocks. This reflects the general + * expectation that it will become increasingly beneficial to start a new block + * as the current block grows longer. + * + * Finally, for an approximation, it is not strictly necessary that the exact + * symbols being used are considered. With "near-optimal parsing", for example, + * the actual symbols that will be used are unknown until after the block + * boundary is chosen and the block has been optimized. Since the final choices + * cannot be used, we can use preliminary "greedy" choices instead. + */ + +/* Initialize the block split statistics when starting a new block. */ +static void +init_block_split_stats(struct block_split_stats *stats) +{ + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + stats->new_observations[i] = 0; + stats->observations[i] = 0; + } + stats->num_new_observations = 0; + stats->num_observations = 0; +} + +/* + * Literal observation. Heuristic: use the top 2 bits and low 1 bits of the + * literal, for 8 possible literal observation types. + */ +static void +observe_literal(struct block_split_stats *stats, u8 lit) +{ + stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++; + stats->num_new_observations++; +} + +/* + * Match observation. Heuristic: use one observation type for "short match" and + * one observation type for "long match". + */ +static void +observe_match(struct block_split_stats *stats, unsigned length) +{ + stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + + (length >= 9)]++; + stats->num_new_observations++; +} + +static void +merge_new_observations(struct block_split_stats *stats) +{ + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + stats->observations[i] += stats->new_observations[i]; + stats->new_observations[i] = 0; + } + stats->num_observations += stats->num_new_observations; + stats->num_new_observations = 0; +} + +static bool +do_end_block_check(struct block_split_stats *stats, u32 block_length) +{ + if (stats->num_observations > 0) { + /* + * Compute the sum of absolute differences of probabilities. To + * avoid needing to use floating point arithmetic or do slow + * divisions, we do all arithmetic with the probabilities + * multiplied by num_observations * num_new_observations. E.g., + * for the "old" observations the probabilities would be + * (double)observations[i] / num_observations, but since we + * multiply by both num_observations and num_new_observations we + * really do observations[i] * num_new_observations. 
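+ * Symmetrically, the "new" probabilities become
+ * new_observations[i] * num_observations, so both sides of the comparison
+ * end up on the same scale.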
+ */ + u32 total_delta = 0; + u32 num_items; + u32 cutoff; + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + u32 expected = stats->observations[i] * + stats->num_new_observations; + u32 actual = stats->new_observations[i] * + stats->num_observations; + u32 delta = (actual > expected) ? actual - expected : + expected - actual; + + total_delta += delta; + } + + num_items = stats->num_observations + + stats->num_new_observations; + /* + * Heuristic: the cutoff is when the sum of absolute differences + * of probabilities becomes at least 200/512. As above, the + * probability is multiplied by both num_new_observations and + * num_observations. Be careful to avoid integer overflow. + */ + cutoff = stats->num_new_observations * 200 / 512 * + stats->num_observations; + /* + * Very short blocks have a lot of overhead for the Huffman + * codes, so only use them if it clearly seems worthwhile. + * (This is an additional penalty, which adds to the smaller + * penalty below which scales more slowly.) + */ + if (block_length < 10000 && num_items < 8192) + cutoff += (u64)cutoff * (8192 - num_items) / 8192; + + /* Ready to end the block? */ + if (total_delta + + (block_length / 4096) * stats->num_observations >= cutoff) + return true; + } + merge_new_observations(stats); + return false; +} + +static bool +ready_to_check_block(const struct block_split_stats *stats, + const u8 *in_block_begin, const u8 *in_next, + const u8 *in_end) +{ + return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK + && in_next - in_block_begin >= MIN_BLOCK_LENGTH + && in_end - in_next >= MIN_BLOCK_LENGTH; +} + +static bool +should_end_block(struct block_split_stats *stats, + const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) +{ + /* Ready to try to end the block (again)? */ + if (!ready_to_check_block(stats, in_block_begin, in_next, in_end)) + return false; + + return do_end_block_check(stats, in_next - in_block_begin); +} + +/******************************************************************************/ + +static void +deflate_begin_sequences(struct libdeflate_compressor *c, + struct deflate_sequence *first_seq) +{ + deflate_reset_symbol_frequencies(c); + first_seq->litrunlen_and_length = 0; +} + +static void +deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal, + bool gather_split_stats, struct deflate_sequence *seq) +{ + c->freqs.litlen[literal]++; + + if (gather_split_stats) + observe_literal(&c->split_stats, literal); + + STATIC_ASSERT(MAX_BLOCK_LENGTH <= SEQ_LITRUNLEN_MASK); + seq->litrunlen_and_length++; +} + +static void +deflate_choose_match(struct libdeflate_compressor *c, + unsigned length, unsigned offset, bool gather_split_stats, + struct deflate_sequence **seq_p) +{ + struct deflate_sequence *seq = *seq_p; + unsigned length_slot = deflate_length_slot[length]; + unsigned offset_slot = deflate_get_offset_slot(offset); + + c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++; + c->freqs.offset[offset_slot]++; + if (gather_split_stats) + observe_match(&c->split_stats, length); + + seq->litrunlen_and_length |= (u32)length << SEQ_LENGTH_SHIFT; + seq->offset = offset; + seq->offset_slot = offset_slot; + + seq++; + seq->litrunlen_and_length = 0; + *seq_p = seq; +} + +/* + * Decrease the maximum and nice match lengths if we're approaching the end of + * the input buffer. 
+ */ +static void +adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining) +{ + if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { + *max_len = remaining; + *nice_len = MIN(*nice_len, *max_len); + } +} + +/* + * Choose the minimum match length for the greedy and lazy parsers. + * + * By default the minimum match length is 3, which is the smallest length the + * DEFLATE format allows. However, with greedy and lazy parsing, some data + * (e.g. DNA sequencing data) benefits greatly from a longer minimum length. + * Typically, this is because literals are very cheap. In general, the + * near-optimal parser handles this case naturally, but the greedy and lazy + * parsers need a heuristic to decide when to use short matches. + * + * The heuristic we use is to make the minimum match length depend on the number + * of different literals that exist in the data. If there are many different + * literals, then literals will probably be expensive, so short matches will + * probably be worthwhile. Conversely, if not many literals are used, then + * probably literals will be cheap and short matches won't be worthwhile. + */ +static unsigned +choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth) +{ + /* map from num_used_literals to min_len */ + static const u8 min_lens[] = { + 9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* The rest is implicitly 3. */ + }; + unsigned min_len; + + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3); + STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1); + + if (num_used_literals >= ARRAY_LEN(min_lens)) + return 3; + min_len = min_lens[num_used_literals]; + /* + * With a low max_search_depth, it may be too hard to find long matches. + */ + if (max_search_depth < 16) { + if (max_search_depth < 5) + min_len = MIN(min_len, 4); + else if (max_search_depth < 10) + min_len = MIN(min_len, 5); + else + min_len = MIN(min_len, 7); + } + return min_len; +} + +static unsigned +calculate_min_match_len(const u8 *data, size_t data_len, + unsigned max_search_depth) +{ + u8 used[256] = { 0 }; + unsigned num_used_literals = 0; + size_t i; + + /* + * For very short inputs, the static Huffman code has a good chance of + * being best, in which case there is no reason to avoid short matches. + */ + if (data_len < 512) + return DEFLATE_MIN_MATCH_LEN; + + /* + * For an initial approximation, scan the first 4 KiB of data. The + * caller may use recalculate_min_match_len() to update min_len later. + */ + data_len = MIN(data_len, 4096); + for (i = 0; i < data_len; i++) + used[data[i]] = 1; + for (i = 0; i < 256; i++) + num_used_literals += used[i]; + return choose_min_match_len(num_used_literals, max_search_depth); +} + +/* + * Recalculate the minimum match length for a block, now that we know the + * distribution of literals that are actually being used (freqs->litlen). + */ +static unsigned +recalculate_min_match_len(const struct deflate_freqs *freqs, + unsigned max_search_depth) +{ + u32 literal_freq = 0; + u32 cutoff; + unsigned num_used_literals = 0; + int i; + + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + literal_freq += freqs->litlen[i]; + + cutoff = literal_freq >> 10; /* Ignore literals used very rarely. 
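+ * (A literal must account for more than roughly 1/1024 of all
+ * literal occurrences to be counted as "used" below.)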
*/ + + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + if (freqs->litlen[i] > cutoff) + num_used_literals++; + } + return choose_min_match_len(num_used_literals, max_search_depth); +} + +static const u8 * +choose_max_block_end(const u8 *in_block_begin, const u8 *in_end, + size_t soft_max_len) +{ + if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH) + return in_end; + return in_block_begin + soft_max_len; +} + +/* + * This is the level 0 "compressor". It always outputs uncompressed blocks. + */ +static size_t +deflate_compress_none(const u8 *in, size_t in_nbytes, + u8 *out, size_t out_nbytes_avail) +{ + const u8 *in_next = in; + const u8 * const in_end = in + in_nbytes; + u8 *out_next = out; + u8 * const out_end = out + out_nbytes_avail; + + /* + * If the input is zero-length, we still must output a block in order + * for the output to be a valid DEFLATE stream. Handle this case + * specially to avoid potentially passing NULL to memcpy() below. + */ + if (unlikely(in_nbytes == 0)) { + if (out_nbytes_avail < 5) + return 0; + /* BFINAL and BTYPE */ + *out_next++ = 1 | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); + /* LEN and NLEN */ + put_unaligned_le32(0xFFFF0000, out_next); + return 5; + } + + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; + + if (in_end - in_next <= UINT16_MAX) { + bfinal = 1; + len = in_end - in_next; + } + if (out_end - out_next < 5 + len) + return 0; + /* + * Output BFINAL and BTYPE. The stream is already byte-aligned + * here, so this step always requires outputting exactly 1 byte. + */ + *out_next++ = bfinal | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); + + /* Output LEN and NLEN, then the data itself. */ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + __builtin_memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + + return out_next - out; +} + +/* + * This is a faster variant of deflate_compress_greedy(). It uses the + * ht_matchfinder rather than the hc_matchfinder. It also skips the block + * splitting algorithm and just uses fixed length blocks. c->max_search_depth + * has no effect with this algorithm, as it is hardcoded in ht_matchfinder.h. 
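+ * Since blocks are never split adaptively on this path, the calls below pass
+ * 'false' for gather_split_stats and no block-splitting statistics are
+ * collected.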
+ */ +static void +deflate_compress_fastest(struct libdeflate_compressor * restrict c, + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hash = 0; + + ht_matchfinder_init(&c->p.f.ht_mf); + + do { + /* Starting a new DEFLATE block */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = choose_max_block_end( + in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH); + struct deflate_sequence *seq = c->p.f.sequences; + + deflate_begin_sequences(c, seq); + + do { + u32 length; + u32 offset; + size_t remaining = in_end - in_next; + + if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { + max_len = remaining; + if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) { + do { + deflate_choose_literal(c, + *in_next++, false, seq); + } while (--max_len); + break; + } + nice_len = MIN(nice_len, max_len); + } + length = ht_matchfinder_longest_match(&c->p.f.ht_mf, + &in_cur_base, + in_next, + max_len, + nice_len, + &next_hash, + &offset); + if (length) { + /* Match found */ + deflate_choose_match(c, length, offset, false, + &seq); + ht_matchfinder_skip_bytes(&c->p.f.ht_mf, + &in_cur_base, + in_next + 1, + in_end, + length - 1, + &next_hash); + in_next += length; + } else { + /* No match found */ + deflate_choose_literal(c, *in_next++, false, + seq); + } + + /* Check if it's time to output another block. */ + } while (in_next < in_max_block_end && + seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]); + + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.f.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); +} + +/* + * This is the "greedy" DEFLATE compressor. It always chooses the longest match. + */ +static void +deflate_compress_greedy(struct libdeflate_compressor * restrict c, + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hashes[2] = {0, 0}; + + hc_matchfinder_init(&c->p.g.hc_mf); + + do { + /* Starting a new DEFLATE block */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = choose_max_block_end( + in_next, in_end, SOFT_MAX_BLOCK_LENGTH); + struct deflate_sequence *seq = c->p.g.sequences; + unsigned min_len; + + init_block_split_stats(&c->split_stats); + deflate_begin_sequences(c, seq); + min_len = calculate_min_match_len(in_next, + in_max_block_end - in_next, + c->max_search_depth); + do { + u32 length; + u32 offset; + + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + length = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next, + min_len - 1, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + &offset); + + if (length >= min_len && + (length > DEFLATE_MIN_MATCH_LEN || + offset <= 4096)) { + /* Match found */ + deflate_choose_match(c, length, offset, true, + &seq); + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next + 1, + in_end, + length - 1, + next_hashes); + in_next += length; + } else { + /* No match found */ + deflate_choose_literal(c, *in_next++, true, + seq); + } + + /* Check if it's time to output another block. 
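+ * (The block ends when the soft maximum block length is reached, the
+ * sequence buffer fills up, or the block-splitting heuristic fires.)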
*/ + } while (in_next < in_max_block_end && + seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && + !should_end_block(&c->split_stats, + in_block_begin, in_next, in_end)); + + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.g.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); +} + +static void +deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c, + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os, bool lazy2) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hashes[2] = {0, 0}; + + hc_matchfinder_init(&c->p.g.hc_mf); + + do { + /* Starting a new DEFLATE block */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = choose_max_block_end( + in_next, in_end, SOFT_MAX_BLOCK_LENGTH); + const u8 *next_recalc_min_len = + in_next + MIN(in_end - in_next, 10000); + struct deflate_sequence *seq = c->p.g.sequences; + unsigned min_len; + + init_block_split_stats(&c->split_stats); + deflate_begin_sequences(c, seq); + min_len = calculate_min_match_len(in_next, + in_max_block_end - in_next, + c->max_search_depth); + do { + unsigned cur_len; + unsigned cur_offset; + unsigned next_len; + unsigned next_offset; + + /* + * Recalculate the minimum match length if it hasn't + * been done recently. + */ + if (in_next >= next_recalc_min_len) { + min_len = recalculate_min_match_len( + &c->freqs, + c->max_search_depth); + next_recalc_min_len += + MIN(in_end - next_recalc_min_len, + in_next - in_block_begin); + } + + /* Find the longest match at the current position. */ + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + cur_len = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next, + min_len - 1, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + &cur_offset); + if (cur_len < min_len || + (cur_len == DEFLATE_MIN_MATCH_LEN && + cur_offset > 8192)) { + /* No match found. Choose a literal. */ + deflate_choose_literal(c, *in_next++, true, + seq); + continue; + } + in_next++; + +have_cur_match: + /* + * We have a match at the current position. + * If it's very long, choose it immediately. + */ + if (cur_len >= nice_len) { + deflate_choose_match(c, cur_len, cur_offset, + true, &seq); + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 1, + next_hashes); + in_next += cur_len - 1; + continue; + } + + /* + * Try to find a better match at the next position. + * + * Note: since we already have a match at the *current* + * position, we use only half the 'max_search_depth' + * when checking the *next* position. This is a useful + * trade-off because it's more worthwhile to use a + * greater search depth on the initial match. + * + * Note: it's possible to structure the code such that + * there's only one call to longest_match(), which + * handles both the "find the initial match" and "try to + * find a better match" cases. However, it is faster to + * have two call sites, with longest_match() inlined at + * each. 
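+ * The comparison below prefers the next match only if it is
+ * sufficiently longer and/or has a sufficiently smaller offset than
+ * the current one; bsr32() of an offset is used as a cheap proxy for
+ * how many bits that offset costs to encode.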
+ */ + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + next_len = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next++, + cur_len - 1, + max_len, + nice_len, + c->max_search_depth >> 1, + next_hashes, + &next_offset); + if (next_len >= cur_len && + 4 * (int)(next_len - cur_len) + + ((int)bsr32(cur_offset) - + (int)bsr32(next_offset)) > 2) { + /* + * Found a better match at the next position. + * Output a literal. Then the next match + * becomes the current match. + */ + deflate_choose_literal(c, *(in_next - 2), true, + seq); + cur_len = next_len; + cur_offset = next_offset; + goto have_cur_match; + } + + if (lazy2) { + /* In lazy2 mode, look ahead another position */ + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + next_len = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next++, + cur_len - 1, + max_len, + nice_len, + c->max_search_depth >> 2, + next_hashes, + &next_offset); + if (next_len >= cur_len && + 4 * (int)(next_len - cur_len) + + ((int)bsr32(cur_offset) - + (int)bsr32(next_offset)) > 6) { + /* + * There's a much better match two + * positions ahead, so use two literals. + */ + deflate_choose_literal( + c, *(in_next - 3), true, seq); + deflate_choose_literal( + c, *(in_next - 2), true, seq); + cur_len = next_len; + cur_offset = next_offset; + goto have_cur_match; + } + /* + * No better match at either of the next 2 + * positions. Output the current match. + */ + deflate_choose_match(c, cur_len, cur_offset, + true, &seq); + if (cur_len > 3) { + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 3, + next_hashes); + in_next += cur_len - 3; + } + } else { /* !lazy2 */ + /* + * No better match at the next position. Output + * the current match. + */ + deflate_choose_match(c, cur_len, cur_offset, + true, &seq); + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 2, + next_hashes); + in_next += cur_len - 2; + } + /* Check if it's time to output another block. */ + } while (in_next < in_max_block_end && + seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && + !should_end_block(&c->split_stats, + in_block_begin, in_next, in_end)); + + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.g.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); +} + +/* + * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to + * see if there's a better match at the next position. If yes, it outputs a + * literal and continues to the next position. If no, it outputs the match. + */ +static void +deflate_compress_lazy(struct libdeflate_compressor * restrict c, + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) +{ + deflate_compress_lazy_generic(c, in, in_nbytes, os, false); +} + +/* + * The lazy2 compressor. This is similar to the regular lazy one, but it looks + * for a better match at the next 2 positions rather than the next 1. This + * makes it take slightly more time, but compress some inputs slightly more. 
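+ *
+ * Note that the match found two positions ahead must clear a stricter
+ * threshold (> 6 instead of > 2) before it is preferred, since taking it
+ * costs two literals instead of one.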
+ */ +static void +deflate_compress_lazy2(struct libdeflate_compressor * restrict c, + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) +{ + deflate_compress_lazy_generic(c, in, in_nbytes, os, true); +} + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* + * Follow the minimum-cost path in the graph of possible match/literal choices + * for the current block and compute the frequencies of the Huffman symbols that + * would be needed to output those matches and literals. + */ +static void +deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) +{ + struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; + struct deflate_optimum_node *end_node = + &c->p.n.optimum_nodes[block_length]; + + do { + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; + + if (length == 1) { + /* Literal */ + c->freqs.litlen[offset]++; + } else { + /* Match */ + c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + + deflate_length_slot[length]]++; + c->freqs.offset[c->p.n.offset_slot_full[offset]]++; + } + cur_node += length; + } while (cur_node != end_node); + + /* Tally the end-of-block symbol. */ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; +} + +static void +deflate_choose_all_literals(struct libdeflate_compressor *c, + const u8 *block, u32 block_length) +{ + u32 i; + + deflate_reset_symbol_frequencies(c); + for (i = 0; i < block_length; i++) + c->freqs.litlen[block[i]]++; + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + + deflate_make_huffman_codes(&c->freqs, &c->codes); +} + +/* + * Compute the exact cost, in bits, that would be required to output the matches + * and literals described by @c->freqs as a dynamic Huffman block. The litlen + * and offset codes are assumed to have already been built in @c->codes. + */ +static u32 +deflate_compute_true_cost(struct libdeflate_compressor *c) +{ + u32 cost = 0; + unsigned sym; + + deflate_precompute_huffman_header(c); + + __builtin_memset(&c->codes.lens.litlen[c->o.precode.num_litlen_syms], 0, + DEFLATE_NUM_LITLEN_SYMS - c->o.precode.num_litlen_syms); + + cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); + for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { + cost += c->o.precode.freqs[sym] * + (c->o.precode.lens[sym] + + deflate_extra_precode_bits[sym]); + } + + for (sym = 0; sym < DEFLATE_FIRST_LEN_SYM; sym++) + cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; + + for (; sym < DEFLATE_FIRST_LEN_SYM + + ARRAY_LEN(deflate_extra_length_bits); sym++) + cost += c->freqs.litlen[sym] * + (c->codes.lens.litlen[sym] + + deflate_extra_length_bits[sym - DEFLATE_FIRST_LEN_SYM]); + + for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) + cost += c->freqs.offset[sym] * + (c->codes.lens.offset[sym] + + deflate_extra_offset_bits[sym]); + return cost; +} + +/* Set the current cost model from the codeword lengths specified in @lens. */ +static void +deflate_set_costs_from_codes(struct libdeflate_compressor *c, + const struct deflate_lens *lens) +{ + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + u32 bits = (lens->litlen[i] ? + lens->litlen[i] : LITERAL_NOSTAT_BITS); + + c->p.n.costs.literal[i] = bits * BIT_COST; + } + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { + unsigned length_slot = deflate_length_slot[i]; + unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot; + u32 bits = (lens->litlen[litlen_sym] ? 
+ lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); + + bits += deflate_extra_length_bits[length_slot]; + c->p.n.costs.length[i] = bits * BIT_COST; + } + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { + u32 bits = (lens->offset[i] ? + lens->offset[i] : OFFSET_NOSTAT_BITS); + + bits += deflate_extra_offset_bits[i]; + c->p.n.costs.offset_slot[i] = bits * BIT_COST; + } +} + +/* + * This lookup table gives the default cost of a literal symbol and of a length + * symbol, depending on the characteristics of the input data. It was generated + * by scripts/gen_default_litlen_costs.py. + * + * This table is indexed first by the estimated match probability: + * + * i=0: data doesn't contain many matches [match_prob=0.25] + * i=1: neutral [match_prob=0.50] + * i=2: data contains lots of matches [match_prob=0.75] + * + * This lookup produces a subtable which maps the number of distinct used + * literals to the default cost of a literal symbol, i.e.: + * + * int(-log2((1 - match_prob) / num_used_literals) * BIT_COST) + * + * ... for num_used_literals in [1, 256] (and 0, which is copied from 1). This + * accounts for literals usually getting cheaper as the number of distinct + * literals decreases, and as the proportion of literals to matches increases. + * + * The lookup also produces the cost of a length symbol, which is: + * + * int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST) + * + * Note: we don't currently assign different costs to different literal symbols, + * or to different length symbols, as this is hard to do in a useful way. + */ +static const struct { + u8 used_lits_to_lit_cost[257]; + u8 len_sym_cost; +} default_litlen_costs[] = { + { /* match_prob = 0.25 */ + .used_lits_to_lit_cost = { + 6, 6, 22, 32, 38, 43, 48, 51, + 54, 57, 59, 61, 64, 65, 67, 69, + 70, 72, 73, 74, 75, 76, 77, 79, + 80, 80, 81, 82, 83, 84, 85, 85, + 86, 87, 88, 88, 89, 89, 90, 91, + 91, 92, 92, 93, 93, 94, 95, 95, + 96, 96, 96, 97, 97, 98, 98, 99, + 99, 99, 100, 100, 101, 101, 101, 102, + 102, 102, 103, 103, 104, 104, 104, 105, + 105, 105, 105, 106, 106, 106, 107, 107, + 107, 108, 108, 108, 108, 109, 109, 109, + 109, 110, 110, 110, 111, 111, 111, 111, + 112, 112, 112, 112, 112, 113, 113, 113, + 113, 114, 114, 114, 114, 114, 115, 115, + 115, 115, 115, 116, 116, 116, 116, 116, + 117, 117, 117, 117, 117, 118, 118, 118, + 118, 118, 118, 119, 119, 119, 119, 119, + 120, 120, 120, 120, 120, 120, 121, 121, + 121, 121, 121, 121, 121, 122, 122, 122, + 122, 122, 122, 123, 123, 123, 123, 123, + 123, 123, 124, 124, 124, 124, 124, 124, + 124, 125, 125, 125, 125, 125, 125, 125, + 125, 126, 126, 126, 126, 126, 126, 126, + 127, 127, 127, 127, 127, 127, 127, 127, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 133, + 133, 133, 133, 133, 133, 133, 133, 133, + 133, 134, 134, 134, 134, 134, 134, 134, + 134, + }, + .len_sym_cost = 109, + }, { /* match_prob = 0.5 */ + .used_lits_to_lit_cost = { + 16, 16, 32, 41, 48, 53, 57, 60, + 64, 66, 69, 71, 73, 75, 76, 78, + 80, 81, 82, 83, 85, 86, 87, 88, + 89, 90, 91, 92, 92, 93, 94, 95, + 96, 96, 97, 98, 98, 99, 99, 100, + 101, 101, 102, 102, 103, 103, 104, 104, + 105, 105, 106, 106, 107, 107, 108, 108, + 108, 109, 109, 110, 110, 110, 111, 111, + 112, 112, 112, 113, 113, 113, 114, 114, + 114, 115, 115, 115, 115, 116, 116, 116, + 117, 117, 117, 118, 118, 118, 118, 119, + 
119, 119, 119, 120, 120, 120, 120, 121, + 121, 121, 121, 122, 122, 122, 122, 122, + 123, 123, 123, 123, 124, 124, 124, 124, + 124, 125, 125, 125, 125, 125, 126, 126, + 126, 126, 126, 127, 127, 127, 127, 127, + 128, 128, 128, 128, 128, 128, 129, 129, + 129, 129, 129, 129, 130, 130, 130, 130, + 130, 130, 131, 131, 131, 131, 131, 131, + 131, 132, 132, 132, 132, 132, 132, 133, + 133, 133, 133, 133, 133, 133, 134, 134, + 134, 134, 134, 134, 134, 134, 135, 135, + 135, 135, 135, 135, 135, 135, 136, 136, + 136, 136, 136, 136, 136, 136, 137, 137, + 137, 137, 137, 137, 137, 137, 138, 138, + 138, 138, 138, 138, 138, 138, 138, 139, + 139, 139, 139, 139, 139, 139, 139, 139, + 140, 140, 140, 140, 140, 140, 140, 140, + 140, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 143, 143, + 143, 143, 143, 143, 143, 143, 143, 143, + 144, + }, + .len_sym_cost = 93, + }, { /* match_prob = 0.75 */ + .used_lits_to_lit_cost = { + 32, 32, 48, 57, 64, 69, 73, 76, + 80, 82, 85, 87, 89, 91, 92, 94, + 96, 97, 98, 99, 101, 102, 103, 104, + 105, 106, 107, 108, 108, 109, 110, 111, + 112, 112, 113, 114, 114, 115, 115, 116, + 117, 117, 118, 118, 119, 119, 120, 120, + 121, 121, 122, 122, 123, 123, 124, 124, + 124, 125, 125, 126, 126, 126, 127, 127, + 128, 128, 128, 129, 129, 129, 130, 130, + 130, 131, 131, 131, 131, 132, 132, 132, + 133, 133, 133, 134, 134, 134, 134, 135, + 135, 135, 135, 136, 136, 136, 136, 137, + 137, 137, 137, 138, 138, 138, 138, 138, + 139, 139, 139, 139, 140, 140, 140, 140, + 140, 141, 141, 141, 141, 141, 142, 142, + 142, 142, 142, 143, 143, 143, 143, 143, + 144, 144, 144, 144, 144, 144, 145, 145, + 145, 145, 145, 145, 146, 146, 146, 146, + 146, 146, 147, 147, 147, 147, 147, 147, + 147, 148, 148, 148, 148, 148, 148, 149, + 149, 149, 149, 149, 149, 149, 150, 150, + 150, 150, 150, 150, 150, 150, 151, 151, + 151, 151, 151, 151, 151, 151, 152, 152, + 152, 152, 152, 152, 152, 152, 153, 153, + 153, 153, 153, 153, 153, 153, 154, 154, + 154, 154, 154, 154, 154, 154, 154, 155, + 155, 155, 155, 155, 155, 155, 155, 155, + 156, 156, 156, 156, 156, 156, 156, 156, + 156, 157, 157, 157, 157, 157, 157, 157, + 157, 157, 157, 158, 158, 158, 158, 158, + 158, 158, 158, 158, 158, 158, 159, 159, + 159, 159, 159, 159, 159, 159, 159, 159, + 160, + }, + .len_sym_cost = 84, + }, +}; + +/* + * Choose the default costs for literal and length symbols. These symbols are + * both part of the litlen alphabet. + */ +static void +deflate_choose_default_litlen_costs(struct libdeflate_compressor *c, + const u8 *block_begin, u32 block_length, + u32 *lit_cost, u32 *len_sym_cost) +{ + unsigned num_used_literals = 0; + u32 literal_freq = block_length; + u32 match_freq = 0; + u32 cutoff; + u32 i; + + /* Calculate the number of distinct literals that exist in the data. */ + __builtin_memset(c->freqs.litlen, 0, + DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0])); + cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */ + for (i = 0; i < block_length; i++) + c->freqs.litlen[block_begin[i]]++; + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + if (c->freqs.litlen[i] > cutoff) + num_used_literals++; + } + if (num_used_literals == 0) + num_used_literals = 1; + + /* + * Estimate the relative frequency of literals and matches in the + * optimal parsing solution. We don't know the optimal solution, so + * this can only be a very rough estimate. Therefore, we basically use + * the match frequency from a greedy parse. 
We also apply the min_len + * heuristic used by the greedy and lazy parsers, to avoid counting too + * many matches when literals are cheaper than short matches. + */ + match_freq = 0; + i = choose_min_match_len(num_used_literals, c->max_search_depth); + for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { + match_freq += c->p.n.match_len_freqs[i]; + literal_freq -= i * c->p.n.match_len_freqs[i]; + } + if ((s32)literal_freq < 0) /* shouldn't happen */ + literal_freq = 0; + + if (match_freq > literal_freq) + i = 2; /* many matches */ + else if (match_freq * 4 > literal_freq) + i = 1; /* neutral */ + else + i = 0; /* few matches */ + + STATIC_ASSERT(BIT_COST == 16); + *lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[ + num_used_literals]; + *len_sym_cost = default_litlen_costs[i].len_sym_cost; +} + +static u32 +deflate_default_length_cost(unsigned len, u32 len_sym_cost) +{ + unsigned slot = deflate_length_slot[len]; + u32 num_extra_bits = deflate_extra_length_bits[slot]; + + return len_sym_cost + (num_extra_bits * BIT_COST); +} + +static u32 +deflate_default_offset_slot_cost(unsigned slot) +{ + u32 num_extra_bits = deflate_extra_offset_bits[slot]; + /* + * Assume that all offset symbols are equally probable. + * The resulting cost is 'int(-log2(1/30) * BIT_COST)', + * where 30 is the number of potentially-used offset symbols. + */ + u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000; + + return offset_sym_cost + (num_extra_bits * BIT_COST); +} + +/* Set default symbol costs for the first block's first optimization pass. */ +static void +deflate_set_default_costs(struct libdeflate_compressor *c, + u32 lit_cost, u32 len_sym_cost) +{ + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + c->p.n.costs.literal[i] = lit_cost; + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + c->p.n.costs.length[i] = + deflate_default_length_cost(i, len_sym_cost); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + c->p.n.costs.offset_slot[i] = + deflate_default_offset_slot_cost(i); +} + +static void +deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount) +{ + if (change_amount == 0) + /* Block is very similar to previous; prefer previous costs. */ + *cost_p = (default_cost + 3 * *cost_p) / 4; + else if (change_amount == 1) + *cost_p = (default_cost + *cost_p) / 2; + else if (change_amount == 2) + *cost_p = (5 * default_cost + 3 * *cost_p) / 8; + else + /* Block differs greatly from previous; prefer default costs. */ + *cost_p = (3 * default_cost + *cost_p) / 4; +} + +static void +deflate_adjust_costs_impl(struct libdeflate_compressor *c, + u32 lit_cost, u32 len_sym_cost, int change_amount) +{ + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost, + change_amount); + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + deflate_adjust_cost(&c->p.n.costs.length[i], + deflate_default_length_cost(i, + len_sym_cost), + change_amount); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + deflate_adjust_cost(&c->p.n.costs.offset_slot[i], + deflate_default_offset_slot_cost(i), + change_amount); +} + +/* + * Adjust the costs when beginning a new block. + * + * Since the current costs are optimized for the data already, it can be helpful + * to reuse them instead of starting over with the default costs. 
However, this + * depends on how similar the new block is to the previous block. Therefore, + * use a heuristic to decide how similar the blocks are, and mix together the + * current costs and the default costs accordingly. + */ +static void +deflate_adjust_costs(struct libdeflate_compressor *c, + u32 lit_cost, u32 len_sym_cost) +{ + u64 total_delta = 0; + u64 cutoff; + int i; + + /* + * Decide how different the current block is from the previous block, + * using the block splitting statistics from the current and previous + * blocks. The more different the current block is, the more we prefer + * the default costs rather than the previous block's costs. + * + * The algorithm here is similar to the end-of-block check one, but here + * we compare two entire blocks rather than a partial block with a small + * extra part, and therefore we need 64-bit numbers in some places. + */ + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + u64 prev = (u64)c->p.n.prev_observations[i] * + c->split_stats.num_observations; + u64 cur = (u64)c->split_stats.observations[i] * + c->p.n.prev_num_observations; + + total_delta += prev > cur ? prev - cur : cur - prev; + } + cutoff = ((u64)c->p.n.prev_num_observations * + c->split_stats.num_observations * 200) / 512; + + if (total_delta > 3 * cutoff) + /* Big change in the data; just use the default costs. */ + deflate_set_default_costs(c, lit_cost, len_sym_cost); + else if (4 * total_delta > 9 * cutoff) + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3); + else if (2 * total_delta > 3 * cutoff) + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2); + else if (2 * total_delta > cutoff) + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1); + else + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0); +} + +static void +deflate_set_initial_costs(struct libdeflate_compressor *c, + const u8 *block_begin, u32 block_length, + bool is_first_block) +{ + u32 lit_cost, len_sym_cost; + + deflate_choose_default_litlen_costs(c, block_begin, block_length, + &lit_cost, &len_sym_cost); + if (is_first_block) + deflate_set_default_costs(c, lit_cost, len_sym_cost); + else + deflate_adjust_costs(c, lit_cost, len_sym_cost); +} + +/* + * Find the minimum-cost path through the graph of possible match/literal + * choices for this block. + * + * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which + * represents the node at the beginning of the block, to + * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of + * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'. + * + * The algorithm works backwards, starting at the end node and proceeding + * backwards one node at a time. At each node, the minimum cost to reach the + * end node is computed and the match/literal choice that begins that path is + * saved. + */ +static void +deflate_find_min_cost_path(struct libdeflate_compressor *c, + const u32 block_length, + const struct lz_match *cache_ptr) +{ + struct deflate_optimum_node *end_node = + &c->p.n.optimum_nodes[block_length]; + struct deflate_optimum_node *cur_node = end_node; + + cur_node->cost_to_end = 0; + do { + unsigned num_matches; + unsigned literal; + u32 best_cost_to_end; + + cur_node--; + cache_ptr--; + + num_matches = cache_ptr->length; + literal = cache_ptr->offset; + + /* It's always possible to choose a literal. 
*/ + best_cost_to_end = c->p.n.costs.literal[literal] + + (cur_node + 1)->cost_to_end; + cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; + + /* Also consider matches if there are any. */ + if (num_matches) { + const struct lz_match *match; + unsigned len; + unsigned offset; + unsigned offset_slot; + u32 offset_cost; + u32 cost_to_end; + + /* + * Consider each length from the minimum + * (DEFLATE_MIN_MATCH_LEN) to the length of the longest + * match found at this position. For each length, we + * consider only the smallest offset for which that + * length is available. Although this is not guaranteed + * to be optimal due to the possibility of a larger + * offset costing less than a smaller offset to code, + * this is a very useful heuristic. + */ + match = cache_ptr - num_matches; + len = DEFLATE_MIN_MATCH_LEN; + do { + offset = match->offset; + offset_slot = c->p.n.offset_slot_full[offset]; + offset_cost = + c->p.n.costs.offset_slot[offset_slot]; + do { + cost_to_end = offset_cost + + c->p.n.costs.length[len] + + (cur_node + len)->cost_to_end; + if (cost_to_end < best_cost_to_end) { + best_cost_to_end = cost_to_end; + cur_node->item = len | + ((u32)offset << + OPTIMUM_OFFSET_SHIFT); + } + } while (++len <= match->length); + } while (++match != cache_ptr); + cache_ptr -= num_matches; + } + cur_node->cost_to_end = best_cost_to_end; + } while (cur_node != &c->p.n.optimum_nodes[0]); + + deflate_reset_symbol_frequencies(c); + deflate_tally_item_list(c, block_length); + deflate_make_huffman_codes(&c->freqs, &c->codes); +} + +/* + * Choose the literals and matches for the current block, then output the block. + * + * To choose the literal/match sequence, we find the minimum-cost path through + * the block's graph of literal/match choices, given a cost model. However, the + * true cost of each symbol is unknown until the Huffman codes have been built, + * but at the same time the Huffman codes depend on the frequencies of chosen + * symbols. Consequently, multiple passes must be used to try to approximate an + * optimal solution. The first pass uses default costs, mixed with the costs + * from the previous block when it seems appropriate. Later passes use the + * Huffman codeword lengths from the previous pass as the costs. + * + * As an alternate strategy, also consider using only literals. The boolean + * returned in *used_only_literals indicates whether that strategy was best. + */ +static void +deflate_optimize_and_flush_block(struct libdeflate_compressor *c, + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct lz_match *cache_ptr, + bool is_first_block, bool is_final_block, + bool *used_only_literals) +{ + unsigned num_passes_remaining = c->p.n.max_optim_passes; + u32 best_true_cost = UINT32_MAX; + u32 true_cost; + u32 only_lits_cost; + u32 static_cost = UINT32_MAX; + struct deflate_sequence seq_; + struct deflate_sequence *seq = NULL; + u32 i; + + /* + * On some data, using only literals (no matches) ends up being better + * than what the iterative optimization algorithm produces. Therefore, + * consider using only literals. + */ + deflate_choose_all_literals(c, block_begin, block_length); + only_lits_cost = deflate_compute_true_cost(c); + + /* + * Force the block to really end at the desired length, even if some + * matches extend beyond it. 
+ */ + for (i = block_length; + i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, + ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) + c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; + + /* + * Sometimes a static Huffman block ends up being cheapest, particularly + * if the block is small. So, if the block is sufficiently small, find + * the optimal static block solution and remember its cost. + */ + if (block_length <= c->p.n.max_len_to_optimize_static_block) { + /* Save c->p.n.costs temporarily. */ + c->p.n.costs_saved = c->p.n.costs; + + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; + static_cost += 7; /* for the end-of-block symbol */ + + /* Restore c->p.n.costs. */ + c->p.n.costs = c->p.n.costs_saved; + } + + /* Initialize c->p.n.costs with default costs. */ + deflate_set_initial_costs(c, block_begin, block_length, is_first_block); + + do { + /* + * Find the minimum-cost path for this pass. + * Also set c->freqs and c->codes to match the path. + */ + deflate_find_min_cost_path(c, block_length, cache_ptr); + + /* + * Compute the exact cost of the block if the path were to be + * used. Note that this differs from + * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses + * the actual Huffman codes instead of c->p.n.costs. + */ + true_cost = deflate_compute_true_cost(c); + + /* + * If the cost didn't improve much from the previous pass, then + * doing more passes probably won't be helpful, so stop early. + */ + if (true_cost + c->p.n.min_improvement_to_continue > + best_true_cost) + break; + + best_true_cost = true_cost; + + /* Save the cost model that gave 'best_true_cost'. */ + c->p.n.costs_saved = c->p.n.costs; + + /* Update the cost model from the Huffman codes. */ + deflate_set_costs_from_codes(c, &c->codes.lens); + + } while (--num_passes_remaining); + + *used_only_literals = false; + if (MIN(only_lits_cost, static_cost) < best_true_cost) { + if (only_lits_cost < static_cost) { + /* Using only literals ended up being best! */ + deflate_choose_all_literals(c, block_begin, block_length); + deflate_set_costs_from_codes(c, &c->codes.lens); + seq_.litrunlen_and_length = block_length; + seq = &seq_; + *used_only_literals = true; + } else { + /* Static block ended up being best! */ + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + } + } else if (true_cost >= + best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { + /* + * The best solution was actually from a non-final optimization + * pass, so recover and use the min-cost path from that pass. 
+ */ + c->p.n.costs = c->p.n.costs_saved; + deflate_find_min_cost_path(c, block_length, cache_ptr); + deflate_set_costs_from_codes(c, &c->codes.lens); + } + deflate_flush_block(c, os, block_begin, block_length, seq, + is_final_block); +} + +static void +deflate_near_optimal_init_stats(struct libdeflate_compressor *c) +{ + init_block_split_stats(&c->split_stats); + __builtin_memset(c->p.n.new_match_len_freqs, 0, + sizeof(c->p.n.new_match_len_freqs)); + __builtin_memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); +} + +static void +deflate_near_optimal_merge_stats(struct libdeflate_compressor *c) +{ + unsigned i; + + merge_new_observations(&c->split_stats); + for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { + c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i]; + c->p.n.new_match_len_freqs[i] = 0; + } +} + +/* + * Save some literal/match statistics from the previous block so that + * deflate_adjust_costs() will be able to decide how much the current block + * differs from the previous one. + */ +static void +deflate_near_optimal_save_stats(struct libdeflate_compressor *c) +{ + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) + c->p.n.prev_observations[i] = c->split_stats.observations[i]; + c->p.n.prev_num_observations = c->split_stats.num_observations; +} + +static void +deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c) +{ + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) + c->split_stats.observations[i] = 0; + c->split_stats.num_observations = 0; + __builtin_memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); +} + +/* + * This is the "near-optimal" DEFLATE compressor. It computes the optimal + * representation of each DEFLATE block using a minimum-cost path search over + * the graph of possible match/literal choices for that block, assuming a + * certain cost for each Huffman symbol. + * + * For several reasons, the end result is not guaranteed to be optimal: + * + * - Nonoptimal choice of blocks + * - Heuristic limitations on which matches are actually considered + * - Symbol costs are unknown until the symbols have already been chosen + * (so iterative optimization must be used) + */ +static void +deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) +{ + const u8 *in_next = in; + const u8 *in_block_begin = in_next; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + const u8 *in_next_slide = + in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE); + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + struct lz_match *cache_ptr = c->p.n.match_cache; + u32 next_hashes[2] = {0, 0}; + bool prev_block_used_only_literals = false; + + bt_matchfinder_init(&c->p.n.bt_mf); + deflate_near_optimal_init_stats(c); + + do { + /* Starting a new DEFLATE block */ + const u8 * const in_max_block_end = choose_max_block_end( + in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH); + const u8 *prev_end_block_check = NULL; + bool change_detected = false; + const u8 *next_observation = in_next; + unsigned min_len; + + /* + * Use the minimum match length heuristic to improve the + * literal/match statistics gathered during matchfinding. + * However, the actual near-optimal parse won't respect min_len, + * as it can accurately assess the costs of different matches. 
+ * + * If the "use only literals" strategy happened to be the best + * strategy on the previous block, then probably the + * min_match_len heuristic is still not aggressive enough for + * the data, so force gathering literal stats only. + */ + if (prev_block_used_only_literals) + min_len = DEFLATE_MAX_MATCH_LEN + 1; + else + min_len = calculate_min_match_len( + in_block_begin, + in_max_block_end - in_block_begin, + c->max_search_depth); + + /* + * Find matches until we decide to end the block. We end the + * block if any of the following is true: + * + * (1) Maximum block length has been reached + * (2) Match catch may overflow. + * (3) Block split heuristic says to split now. + */ + for (;;) { + struct lz_match *matches; + unsigned best_len; + size_t remaining = in_end - in_next; + + /* Slide the window forward if needed. */ + if (in_next == in_next_slide) { + bt_matchfinder_slide_window(&c->p.n.bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + + MIN(remaining, MATCHFINDER_WINDOW_SIZE); + } + + /* + * Find matches with the current position using the + * binary tree matchfinder and save them in match_cache. + * + * Note: the binary tree matchfinder is more suited for + * optimal parsing than the hash chain matchfinder. The + * reasons for this include: + * + * - The binary tree matchfinder can find more matches + * in the same number of steps. + * - One of the major advantages of hash chains is that + * skipping positions (not searching for matches at + * them) is faster; however, with optimal parsing we + * search for matches at almost all positions, so this + * advantage of hash chains is negated. + */ + matches = cache_ptr; + best_len = 0; + adjust_max_and_nice_len(&max_len, &nice_len, remaining); + if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) { + cache_ptr = bt_matchfinder_get_matches( + &c->p.n.bt_mf, + in_cur_base, + in_next - in_cur_base, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + matches); + if (cache_ptr > matches) + best_len = cache_ptr[-1].length; + } + if (in_next >= next_observation) { + if (best_len >= min_len) { + observe_match(&c->split_stats, + best_len); + next_observation = in_next + best_len; + c->p.n.new_match_len_freqs[best_len]++; + } else { + observe_literal(&c->split_stats, + *in_next); + next_observation = in_next + 1; + } + } + + cache_ptr->length = cache_ptr - matches; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + + /* + * If there was a very long match found, don't cache any + * matches for the bytes covered by that match. This + * avoids degenerate behavior when compressing highly + * redundant data, where the number of matches can be + * very large. + * + * This heuristic doesn't actually hurt the compression + * ratio very much. If there's a long match, then the + * data must be highly compressible, so it doesn't + * matter much what we do. 
+ */ + if (best_len >= DEFLATE_MIN_MATCH_LEN && + best_len >= nice_len) { + --best_len; + do { + remaining = in_end - in_next; + if (in_next == in_next_slide) { + bt_matchfinder_slide_window( + &c->p.n.bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + + MIN(remaining, + MATCHFINDER_WINDOW_SIZE); + } + adjust_max_and_nice_len(&max_len, + &nice_len, + remaining); + if (max_len >= + BT_MATCHFINDER_REQUIRED_NBYTES) { + bt_matchfinder_skip_byte( + &c->p.n.bt_mf, + in_cur_base, + in_next - in_cur_base, + nice_len, + c->max_search_depth, + next_hashes); + } + cache_ptr->length = 0; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + } while (--best_len); + } + /* Maximum block length or end of input reached? */ + if (in_next >= in_max_block_end) + break; + /* Match cache overflowed? */ + if (cache_ptr >= + &c->p.n.match_cache[MATCH_CACHE_LENGTH]) + break; + /* Not ready to try to end the block (again)? */ + if (!ready_to_check_block(&c->split_stats, + in_block_begin, in_next, + in_end)) + continue; + /* Check if it would be worthwhile to end the block. */ + if (do_end_block_check(&c->split_stats, + in_next - in_block_begin)) { + change_detected = true; + break; + } + /* Ending the block doesn't seem worthwhile here. */ + deflate_near_optimal_merge_stats(c); + prev_end_block_check = in_next; + } + /* + * All the matches for this block have been cached. Now choose + * the precise end of the block and the sequence of items to + * output to represent it, then flush the block. + */ + if (change_detected && prev_end_block_check != NULL) { + /* + * The block is being ended because a recent chunk of + * data differs from the rest of the block. We could + * end the block at 'in_next' like the greedy and lazy + * compressors do, but that's not ideal since it would + * include the differing chunk in the block. The + * near-optimal compressor has time to do a better job. + * Therefore, we rewind to just before the chunk, and + * output a block that only goes up to there. + * + * We then set things up to correctly start the next + * block, considering that some work has already been + * done on it (some matches found and stats gathered). + */ + struct lz_match *orig_cache_ptr = cache_ptr; + const u8 *in_block_end = prev_end_block_check; + u32 block_length = in_block_end - in_block_begin; + bool is_first = (in_block_begin == in); + bool is_final = false; + u32 num_bytes_to_rewind = in_next - in_block_end; + size_t cache_len_rewound; + + /* Rewind the match cache. */ + do { + cache_ptr--; + cache_ptr -= cache_ptr->length; + } while (--num_bytes_to_rewind); + cache_len_rewound = orig_cache_ptr - cache_ptr; + + deflate_optimize_and_flush_block( + c, os, in_block_begin, + block_length, cache_ptr, + is_first, is_final, + &prev_block_used_only_literals); + __builtin_memmove(c->p.n.match_cache, cache_ptr, + cache_len_rewound * sizeof(*cache_ptr)); + cache_ptr = &c->p.n.match_cache[cache_len_rewound]; + deflate_near_optimal_save_stats(c); + /* + * Clear the stats for the just-flushed block, leaving + * just the stats for the beginning of the next block. + */ + deflate_near_optimal_clear_old_stats(c); + in_block_begin = in_block_end; + } else { + /* + * The block is being ended for a reason other than a + * differing data chunk being detected. Don't rewind at + * all; just end the block at the current position. 
+ */ + u32 block_length = in_next - in_block_begin; + bool is_first = (in_block_begin == in); + bool is_final = (in_next == in_end); + + deflate_near_optimal_merge_stats(c); + deflate_optimize_and_flush_block( + c, os, in_block_begin, + block_length, cache_ptr, + is_first, is_final, + &prev_block_used_only_literals); + cache_ptr = &c->p.n.match_cache[0]; + deflate_near_optimal_save_stats(c); + deflate_near_optimal_init_stats(c); + in_block_begin = in_next; + } + } while (in_next != in_end && !os->overflow); +} + +/* Initialize c->p.n.offset_slot_full. */ +static void +deflate_init_offset_slot_full(struct libdeflate_compressor *c) +{ + unsigned offset_slot; + unsigned offset; + unsigned offset_end; + + for (offset_slot = 0; offset_slot < ARRAY_LEN(deflate_offset_slot_base); + offset_slot++) { + offset = deflate_offset_slot_base[offset_slot]; + offset_end = offset + + (1 << deflate_extra_offset_bits[offset_slot]); + do { + c->p.n.offset_slot_full[offset] = offset_slot; + } while (++offset != offset_end); + } +} + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +struct libdeflate_compressor * +libdeflate_alloc_compressor_ex(int compression_level, + const struct libdeflate_options *options) +{ + struct libdeflate_compressor *c; + size_t size = offsetof(struct libdeflate_compressor, p); + + check_buildtime_parameters(); + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + if (compression_level < 0 || compression_level > 12) + return NULL; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + if (compression_level >= 10) + size += sizeof(c->p.n); + else +#endif + { + if (compression_level >= 2) + size += sizeof(c->p.g); + else if (compression_level == 1) + size += sizeof(c->p.f); + } + + c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size); + if (!c) + return NULL; + + c->compression_level = compression_level; + + /* + * The higher the compression level, the more we should bother trying to + * compress very small inputs. + */ + c->max_passthrough_size = 55 - (compression_level * 4); + + switch (compression_level) { + case 0: + c->max_passthrough_size = SIZE_MAX; + c->impl = NULL; /* not used */ + break; + case 1: + c->impl = deflate_compress_fastest; + /* max_search_depth is unused. 
*/ + c->nice_match_length = 32; + break; + case 2: + c->impl = deflate_compress_greedy; + c->max_search_depth = 6; + c->nice_match_length = 10; + break; + case 3: + c->impl = deflate_compress_greedy; + c->max_search_depth = 12; + c->nice_match_length = 14; + break; + case 4: + c->impl = deflate_compress_greedy; + c->max_search_depth = 16; + c->nice_match_length = 30; + break; + case 5: + c->impl = deflate_compress_lazy; + c->max_search_depth = 16; + c->nice_match_length = 30; + break; + case 6: + c->impl = deflate_compress_lazy; + c->max_search_depth = 35; + c->nice_match_length = 65; + break; + case 7: + c->impl = deflate_compress_lazy; + c->max_search_depth = 100; + c->nice_match_length = 130; + break; + case 8: + c->impl = deflate_compress_lazy2; + c->max_search_depth = 300; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + break; + case 9: +#if !SUPPORT_NEAR_OPTIMAL_PARSING + default: +#endif + c->impl = deflate_compress_lazy2; + c->max_search_depth = 600; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + break; +#if SUPPORT_NEAR_OPTIMAL_PARSING + case 10: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 35; + c->nice_match_length = 75; + c->p.n.max_optim_passes = 2; + c->p.n.min_improvement_to_continue = 32; + c->p.n.min_bits_to_use_nonfinal_path = 32; + c->p.n.max_len_to_optimize_static_block = 0; + deflate_init_offset_slot_full(c); + break; + case 11: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 100; + c->nice_match_length = 150; + c->p.n.max_optim_passes = 4; + c->p.n.min_improvement_to_continue = 16; + c->p.n.min_bits_to_use_nonfinal_path = 16; + c->p.n.max_len_to_optimize_static_block = 1000; + deflate_init_offset_slot_full(c); + break; + case 12: + default: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 300; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + c->p.n.max_optim_passes = 10; + c->p.n.min_improvement_to_continue = 1; + c->p.n.min_bits_to_use_nonfinal_path = 1; + c->p.n.max_len_to_optimize_static_block = 10000; + deflate_init_offset_slot_full(c); + break; +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + } + + deflate_init_static_codes(c); + + return c; +} + + +struct libdeflate_compressor * +libdeflate_alloc_compressor(int compression_level) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_compressor_ex(compression_level, &defaults); +} + +size_t +libdeflate_deflate_compress(struct libdeflate_compressor *c, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) +{ + struct deflate_output_bitstream os; + + /* + * For extremely short inputs, or for compression level 0, just output + * uncompressed blocks. + */ + if (unlikely(in_nbytes <= c->max_passthrough_size)) + return deflate_compress_none(in, in_nbytes, + out, out_nbytes_avail); + + /* Initialize the output bitstream structure. */ + os.bitbuf = 0; + os.bitcount = 0; + os.next = out; + os.end = os.next + out_nbytes_avail; + os.overflow = false; + + /* Call the actual compression function. */ + (*c->impl)(c, in, in_nbytes, &os); + + /* Return 0 if the output buffer is too small. */ + if (os.overflow) + return 0; + + /* + * Write the final byte if needed. This can't overflow the output + * buffer because deflate_flush_block() would have set the overflow flag + * if there wasn't enough space remaining for the full final block. 
+ */ + ASSERT(os.bitcount <= 7); + if (os.bitcount) { + ASSERT(os.next < os.end); + *os.next++ = os.bitbuf; + } + + /* Return the compressed size in bytes. */ + return os.next - (u8 *)out; +} + +void +libdeflate_free_compressor(struct libdeflate_compressor *c) +{ + if (c) + libdeflate_aligned_free(c); +} + +unsigned int +libdeflate_get_compression_level(struct libdeflate_compressor *c) +{ + return c->compression_level; +} + +size_t +libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, + size_t in_nbytes) +{ + size_t max_blocks; + + /* + * Since the compressor never uses a compressed block when an + * uncompressed block is cheaper, the worst case can be no worse than + * the case where only uncompressed blocks are used. + * + * This is true even though up to 7 bits are "wasted" to byte-align the + * bitstream when a compressed block is followed by an uncompressed + * block. This is because a compressed block wouldn't have been used if + * it wasn't cheaper than an uncompressed block, and uncompressed blocks + * always end on a byte boundary. So the alignment bits will, at worst, + * go up to the place where the uncompressed block would have ended. + */ + + /* + * Calculate the maximum number of uncompressed blocks that the + * compressor can use for 'in_nbytes' of data. + * + * The minimum length that is passed to deflate_flush_block() is + * MIN_BLOCK_LENGTH bytes, except for the final block if needed. If + * deflate_flush_block() decides to use an uncompressed block, it + * actually will (in general) output a series of uncompressed blocks in + * order to stay within the UINT16_MAX limit of DEFLATE. But this can + * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX', + * as in that case this behavior can't result in more blocks than the + * case where deflate_flush_block() is called with min-length inputs. + * + * So the number of uncompressed blocks needed would be bounded by + * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH). However, empty inputs + * need 1 (empty) block, which gives the final expression below. + */ + STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX); + max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); + + /* + * Each uncompressed block has 5 bytes of overhead, for the BFINAL, + * BTYPE, LEN, and NLEN fields. (For the reason explained earlier, the + * alignment bits at the very start of the block can be disregarded; + * they would otherwise increase the overhead to 6 bytes per block.) + * Therefore, the maximum number of overhead bytes is '5 * max_blocks'. + * To get the final bound, add the number of uncompressed bytes. + */ + return (5 * max_blocks) + in_nbytes; +} diff --git a/packages/wasm/lib/libdeflate/deflate_compress.h b/packages/wasm/lib/libdeflate/deflate_compress.h new file mode 100644 index 00000000..bd7a89f9 --- /dev/null +++ b/packages/wasm/lib/libdeflate/deflate_compress.h @@ -0,0 +1,20 @@ +#ifndef LIB_DEFLATE_COMPRESS_H +#define LIB_DEFLATE_COMPRESS_H + +#include "lib_common.h" + +/* + * DEFLATE compression is private to deflate_compress.c, but we do need to be + * able to query the compression level for zlib and gzip header generation. 
+ */ + +struct libdeflate_compressor; + +unsigned int libdeflate_get_compression_level(struct libdeflate_compressor *c); +size_t libdeflate_deflate_compress(struct libdeflate_compressor *c, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +size_t libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, size_t in_nbytes); + +#endif /* LIB_DEFLATE_COMPRESS_H */ diff --git a/packages/wasm/lib/libdeflate/deflate_constants.h b/packages/wasm/lib/libdeflate/deflate_constants.h new file mode 100644 index 00000000..95c9e0a5 --- /dev/null +++ b/packages/wasm/lib/libdeflate/deflate_constants.h @@ -0,0 +1,56 @@ +/* + * deflate_constants.h - constants for the DEFLATE compression format + */ + +#ifndef LIB_DEFLATE_CONSTANTS_H +#define LIB_DEFLATE_CONSTANTS_H + +/* Valid block types */ +#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0 +#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1 +#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2 + +/* Minimum and maximum supported match lengths (in bytes) */ +#define DEFLATE_MIN_MATCH_LEN 3 +#define DEFLATE_MAX_MATCH_LEN 258 + +/* Maximum supported match offset (in bytes) */ +#define DEFLATE_MAX_MATCH_OFFSET 32768 + +/* log2 of DEFLATE_MAX_MATCH_OFFSET */ +#define DEFLATE_WINDOW_ORDER 15 + +/* Number of symbols in each Huffman code. Note: for the literal/length + * and offset codes, these are actually the maximum values; a given block + * might use fewer symbols. */ +#define DEFLATE_NUM_PRECODE_SYMS 19 +#define DEFLATE_NUM_LITLEN_SYMS 288 +#define DEFLATE_NUM_OFFSET_SYMS 32 + +/* The maximum number of symbols across all codes */ +#define DEFLATE_MAX_NUM_SYMS 288 + +/* Division of symbols in the literal/length code */ +#define DEFLATE_NUM_LITERALS 256 +#define DEFLATE_END_OF_BLOCK 256 +#define DEFLATE_FIRST_LEN_SYM 257 + +/* Maximum codeword length, in bits, within each Huffman code */ +#define DEFLATE_MAX_PRE_CODEWORD_LEN 7 +#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15 +#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15 + +/* The maximum codeword length across all codes */ +#define DEFLATE_MAX_CODEWORD_LEN 15 + +/* Maximum possible overrun when decoding codeword lengths */ +#define DEFLATE_MAX_LENS_OVERRUN 137 + +/* + * Maximum number of extra bits that may be required to represent a match + * length or offset. + */ +#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5 +#define DEFLATE_MAX_EXTRA_OFFSET_BITS 13 + +#endif /* LIB_DEFLATE_CONSTANTS_H */ diff --git a/packages/wasm/lib/libdeflate/deflate_decompress.c b/packages/wasm/lib/libdeflate/deflate_decompress.c new file mode 100644 index 00000000..07aaa442 --- /dev/null +++ b/packages/wasm/lib/libdeflate/deflate_decompress.c @@ -0,0 +1,1200 @@ +/* + * deflate_decompress.c - a decompressor for DEFLATE + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * --------------------------------------------------------------------------- + * + * This is a highly optimized DEFLATE decompressor. It is much faster than + * vanilla zlib, typically well over twice as fast, though results vary by CPU. + * + * Why this is faster than vanilla zlib: + * + * - Word accesses rather than byte accesses when reading input + * - Word accesses rather than byte accesses when copying matches + * - Faster Huffman decoding combined with various DEFLATE-specific tricks + * - Larger bitbuffer variable that doesn't need to be refilled as often + * - Other optimizations to remove unnecessary branches + * - Only full-buffer decompression is supported, so the code doesn't need to + * support stopping and resuming decompression. + * - On x86_64, a version of the decompression routine is compiled with BMI2 + * instructions enabled and is used automatically at runtime when supported. + */ + +#include "lib_common.h" +#include "deflate_constants.h" + +/* + * If the expression passed to SAFETY_CHECK() evaluates to false, then the + * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the + * compressed data is invalid. + * + * Theoretically, these checks could be disabled for specialized applications + * where all input to the decompressor will be trusted. + */ +#if 0 +# pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!") +# define SAFETY_CHECK(expr) (void)(expr) +#else +# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA +#endif + +/***************************************************************************** + * Input bitstream * + *****************************************************************************/ + +/* + * The state of the "input bitstream" consists of the following variables: + * + * - in_next: a pointer to the next unread byte in the input buffer + * + * - in_end: a pointer to just past the end of the input buffer + * + * - bitbuf: a word-sized variable containing bits that have been read from + * the input buffer or from the implicit appended zero bytes + * + * - bitsleft: the number of bits in 'bitbuf' available to be consumed. + * After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually + * contain more bits than this. However, only the bits counted + * by 'bitsleft' can actually be consumed; the rest can only be + * used for preloading. + * + * As a micro-optimization, we allow bits 8 and higher of + * 'bitsleft' to contain garbage. When consuming the bits + * associated with a decode table entry, this allows us to do + * 'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'. + * On some CPUs, this helps reduce instruction dependencies. + * This does have the disadvantage that 'bitsleft' sometimes + * needs to be cast to 'u8', such as when it's used as a shift + * amount in REFILL_BITS_BRANCHLESS(). But that one happens + * for free since most CPUs ignore high bits in shift amounts. 
+ * + * - overread_count: the total number of implicit appended zero bytes that + * have been loaded into the bitbuffer, including any + * counted by 'bitsleft' and any already consumed + */ + +/* + * The type for the bitbuffer variable ('bitbuf' described above). For best + * performance, this should have size equal to a machine word. + * + * 64-bit platforms have a significant advantage: they get a bigger bitbuffer + * which they don't have to refill as often. + */ +typedef machine_word_t bitbuf_t; +#define BITBUF_NBITS (8 * (int)sizeof(bitbuf_t)) + +/* BITMASK(n) returns a bitmask of length 'n'. */ +#define BITMASK(n) (((bitbuf_t)1 << (n)) - 1) + +/* + * MAX_BITSLEFT is the maximum number of consumable bits, i.e. the maximum value + * of '(u8)bitsleft'. This is the size of the bitbuffer variable, minus 1 if + * the branchless refill method is being used (see REFILL_BITS_BRANCHLESS()). + */ +#define MAX_BITSLEFT \ + (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS) + +/* + * CONSUMABLE_NBITS is the minimum number of bits that are guaranteed to be + * consumable (counted in 'bitsleft') immediately after refilling the bitbuffer. + * Since only whole bytes can be added to 'bitsleft', the worst case is + * 'MAX_BITSLEFT - 7': the smallest amount where another byte doesn't fit. + */ +#define CONSUMABLE_NBITS (MAX_BITSLEFT - 7) + +/* + * FASTLOOP_PRELOADABLE_NBITS is the minimum number of bits that are guaranteed + * to be preloadable immediately after REFILL_BITS_IN_FASTLOOP(). (It is *not* + * guaranteed after REFILL_BITS(), since REFILL_BITS() falls back to a + * byte-at-a-time refill method near the end of input.) This may exceed the + * number of consumable bits (counted by 'bitsleft'). Any bits not counted in + * 'bitsleft' can only be used for precomputation and cannot be consumed. + */ +#define FASTLOOP_PRELOADABLE_NBITS \ + (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS) + +/* + * PRELOAD_SLACK is the minimum number of bits that are guaranteed to be + * preloadable but not consumable, following REFILL_BITS_IN_FASTLOOP() and any + * subsequent consumptions. This is 1 bit if the branchless refill method is + * being used, and 0 bits otherwise. + */ +#define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT) + +/* + * CAN_CONSUME(n) is true if it's guaranteed that if the bitbuffer has just been + * refilled, then it's always possible to consume 'n' bits from it. 'n' should + * be a compile-time constant, to enable compile-time evaluation. + */ +#define CAN_CONSUME(n) (CONSUMABLE_NBITS >= (n)) + +/* + * CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) is true if it's + * guaranteed that after REFILL_BITS_IN_FASTLOOP(), it's always possible to + * consume 'consume_nbits' bits, then preload 'preload_nbits' bits. The + * arguments should be compile-time constants to enable compile-time evaluation. + */ +#define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) \ + (CONSUMABLE_NBITS >= (consume_nbits) && \ + FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits)) + +/* + * REFILL_BITS_BRANCHLESS() branchlessly refills the bitbuffer variable by + * reading the next word from the input buffer and updating 'in_next' and + * 'bitsleft' based on how many bits were refilled -- counting whole bytes only. + * This is much faster than reading a byte at a time, at least if the CPU is + * little endian and supports fast unaligned memory accesses. 
+ * + * The simplest way of branchlessly updating 'bitsleft' would be: + * + * bitsleft += (MAX_BITSLEFT - bitsleft) & ~7; + * + * To make it faster, we define MAX_BITSLEFT to be 'WORDBITS - 1' rather than + * WORDBITS, so that in binary it looks like 111111 or 11111. Then, we update + * 'bitsleft' by just setting the bits above the low 3 bits: + * + * bitsleft |= MAX_BITSLEFT & ~7; + * + * That compiles down to a single instruction like 'or $0x38, %rbp'. Using + * 'MAX_BITSLEFT == WORDBITS - 1' also has the advantage that refills can be + * done when 'bitsleft == MAX_BITSLEFT' without invoking undefined behavior. + * + * The simplest way of branchlessly updating 'in_next' would be: + * + * in_next += (MAX_BITSLEFT - bitsleft) >> 3; + * + * With 'MAX_BITSLEFT == WORDBITS - 1' we could use an XOR instead, though this + * isn't really better: + * + * in_next += (MAX_BITSLEFT ^ bitsleft) >> 3; + * + * An alternative which can be marginally better is the following: + * + * in_next += sizeof(bitbuf_t) - 1; + * in_next -= (bitsleft >> 3) & 0x7; + * + * It seems this would increase the number of CPU instructions from 3 (sub, shr, + * add) to 4 (add, shr, and, sub). However, if the CPU has a bitfield + * extraction instruction (e.g. arm's ubfx), it stays at 3, and is potentially + * more efficient because the length of the longest dependency chain decreases + * from 3 to 2. This alternative also has the advantage that it ignores the + * high bits in 'bitsleft', so it is compatible with the micro-optimization we + * use where we let the high bits of 'bitsleft' contain garbage. + */ +#define REFILL_BITS_BRANCHLESS() \ +do { \ + bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft; \ + in_next += sizeof(bitbuf_t) - 1; \ + in_next -= (bitsleft >> 3) & 0x7; \ + bitsleft |= MAX_BITSLEFT & ~7; \ +} while (0) + +/* + * REFILL_BITS() loads bits from the input buffer until the bitbuffer variable + * contains at least CONSUMABLE_NBITS consumable bits. + * + * This checks for the end of input, and it doesn't guarantee + * FASTLOOP_PRELOADABLE_NBITS, so it can't be used in the fastloop. + * + * If we would overread the input buffer, we just don't read anything, leaving + * the bits zeroed but marking them filled. This simplifies the decompressor + * because it removes the need to always be able to distinguish between real + * overreads and overreads caused only by the decompressor's own lookahead. + * + * We do still keep track of the number of bytes that have been overread, for + * two reasons. First, it allows us to determine the exact number of bytes that + * were consumed once the stream ends or an uncompressed block is reached. + * Second, it allows us to stop early if the overread amount gets so large (more + * than sizeof bitbuf) that it can only be caused by a real overread. (The + * second part is arguably unneeded, since libdeflate is buffer-based; given + * infinite zeroes, it will eventually either completely fill the output buffer + * or return an error. However, we do it to be slightly more friendly to the + * not-recommended use case of decompressing with an unknown output size.) 
+ */ +#define REFILL_BITS() \ +do { \ + if (UNALIGNED_ACCESS_IS_FAST && \ + likely(in_end - in_next >= sizeof(bitbuf_t))) { \ + REFILL_BITS_BRANCHLESS(); \ + } else { \ + while ((u8)bitsleft < CONSUMABLE_NBITS) { \ + if (likely(in_next != in_end)) { \ + bitbuf |= (bitbuf_t)*in_next++ << \ + (u8)bitsleft; \ + } else { \ + overread_count++; \ + SAFETY_CHECK(overread_count <= \ + sizeof(bitbuf_t)); \ + } \ + bitsleft += 8; \ + } \ + } \ +} while (0) + +/* + * REFILL_BITS_IN_FASTLOOP() is like REFILL_BITS(), but it doesn't check for the + * end of the input. It can only be used in the fastloop. + */ +#define REFILL_BITS_IN_FASTLOOP() \ +do { \ + STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST || \ + FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS); \ + if (UNALIGNED_ACCESS_IS_FAST) { \ + REFILL_BITS_BRANCHLESS(); \ + } else { \ + while ((u8)bitsleft < CONSUMABLE_NBITS) { \ + bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \ + bitsleft += 8; \ + } \ + } \ +} while (0) + +/* + * This is the worst-case maximum number of output bytes that are written to + * during each iteration of the fastloop. The worst case is 2 literals, then a + * match of length DEFLATE_MAX_MATCH_LEN. Additionally, some slack space must + * be included for the intentional overrun in the match copy implementation. + */ +#define FASTLOOP_MAX_BYTES_WRITTEN \ + (2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1) + +/* + * This is the worst-case maximum number of input bytes that are read during + * each iteration of the fastloop. To get this value, we first compute the + * greatest number of bits that can be refilled during a loop iteration. The + * refill at the beginning can add at most MAX_BITSLEFT, and the amount that can + * be refilled later is no more than the maximum amount that can be consumed by + * 2 literals that don't need a subtable, then a match. We convert this value + * to bytes, rounding up; this gives the maximum number of bytes that 'in_next' + * can be advanced. Finally, we add sizeof(bitbuf_t) to account for + * REFILL_BITS_BRANCHLESS() reading a word past 'in_next'. + */ +#define FASTLOOP_MAX_BYTES_READ \ + (DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \ + LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \ + sizeof(bitbuf_t)) + +/***************************************************************************** + * Huffman decoding * + *****************************************************************************/ + +/* + * The fastest way to decode Huffman-encoded data is basically to use a decode + * table that maps the next TABLEBITS bits of data to their symbol. Each entry + * decode_table[i] maps to the symbol whose codeword is a prefix of 'i'. A + * symbol with codeword length 'n' has '2**(TABLEBITS-n)' entries in the table. + * + * Ideally, TABLEBITS and the maximum codeword length would be the same; some + * compression formats are designed with this goal in mind. Unfortunately, in + * DEFLATE, the maximum litlen and offset codeword lengths are 15 bits, which is + * too large for a practical TABLEBITS. It's not *that* much larger, though, so + * the workaround is to use a single level of subtables. In the main table, + * entries for prefixes of codewords longer than TABLEBITS contain a "pointer" + * to the appropriate subtable along with the number of bits it is indexed with. + * + * The most efficient way to allocate subtables is to allocate them dynamically + * after the main table. The worst-case number of table entries needed, + * including subtables, is precomputable; see the ENOUGH constants below. 
+ * + * A useful optimization is to store the codeword lengths in the decode table so + * that they don't have to be looked up by indexing a separate table that maps + * symbols to their codeword lengths. We basically do this; however, for the + * litlen and offset codes we also implement some DEFLATE-specific optimizations + * that build in the consideration of the "extra bits" and the + * literal/length/end-of-block division. For the exact decode table entry + * format we use, see the definitions of the *_decode_results[] arrays below. + */ + + +/* + * These are the TABLEBITS values we use for each of the DEFLATE Huffman codes, + * along with their corresponding ENOUGH values. + * + * For the precode, we use PRECODE_TABLEBITS == 7 since this is the maximum + * precode codeword length. This avoids ever needing subtables. + * + * For the litlen and offset codes, we cannot realistically avoid ever needing + * subtables, since litlen and offset codewords can be up to 15 bits. A higher + * TABLEBITS reduces the number of lookups that need a subtable, which increases + * performance; however, it increases memory usage and makes building the table + * take longer, which decreases performance. We choose values that work well in + * practice, making subtables rarely needed without making the tables too large. + * + * Our choice of OFFSET_TABLEBITS == 8 is a bit low; without any special + * considerations, 9 would fit the trade-off curve better. However, there is a + * performance benefit to using exactly 8 bits when it is a compile-time + * constant, as many CPUs can take the low byte more easily than the low 9 bits. + * + * zlib treats its equivalents of TABLEBITS as maximum values; whenever it + * builds a table, it caps the actual table_bits to the longest codeword. This + * makes sense in theory, as there's no need for the table to be any larger than + * needed to support the longest codeword. However, having the table bits be a + * compile-time constant is beneficial to the performance of the decode loop, so + * there is a trade-off. libdeflate currently uses the dynamic table_bits + * strategy for the litlen table only, due to its larger maximum size. + * PRECODE_TABLEBITS and OFFSET_TABLEBITS are smaller, so going dynamic there + * isn't as useful, and OFFSET_TABLEBITS=8 is useful as mentioned above. + * + * Each TABLEBITS value has a corresponding ENOUGH value that gives the + * worst-case maximum number of decode table entries, including the main table + * and all subtables. The ENOUGH value depends on three parameters: + * + * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS) + * (2) the maximum number of main table bits (*_TABLEBITS) + * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN) + * + * The ENOUGH values were computed using the utility program 'enough' from zlib. + */ +#define PRECODE_TABLEBITS 7 +#define PRECODE_ENOUGH 128 /* enough 19 7 7 */ +#define LITLEN_TABLEBITS 11 +#define LITLEN_ENOUGH 2342 /* enough 288 11 15 */ +#define OFFSET_TABLEBITS 8 +#define OFFSET_ENOUGH 402 /* enough 32 8 15 */ + +/* + * make_decode_table_entry() creates a decode table entry for the given symbol + * by combining the static part 'decode_results[sym]' with the dynamic part + * 'len', which is the remaining codeword length (the codeword length for main + * table entries, or the codeword length minus TABLEBITS for subtable entries). 
+ * + * In all cases, we add 'len' to each of the two low-order bytes to create the + * appropriately-formatted decode table entry. See the definitions of the + * *_decode_results[] arrays below, where the entry format is described. + */ +static u32 +make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len) +{ + return decode_results[sym] + (len << 8) + len; +} + +/* + * Here is the format of our precode decode table entries. Bits not explicitly + * described contain zeroes: + * + * Bit 20-16: presym + * Bit 10-8: codeword length [not used] + * Bit 2-0: codeword length + * + * The precode decode table never has subtables, since we use + * PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN. + * + * precode_decode_results[] contains the static part of the entry for each + * symbol. make_decode_table_entry() produces the final entries. + */ +static const u32 precode_decode_results[] = { +#define ENTRY(presym) ((u32)presym << 16) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , +#undef ENTRY +}; + +/* Litlen and offset decode table entry flags */ + +/* Indicates a literal entry in the litlen decode table */ +#define HUFFDEC_LITERAL 0x80000000 + +/* Indicates that HUFFDEC_SUBTABLE_POINTER or HUFFDEC_END_OF_BLOCK is set */ +#define HUFFDEC_EXCEPTIONAL 0x00008000 + +/* Indicates a subtable pointer entry in the litlen or offset decode table */ +#define HUFFDEC_SUBTABLE_POINTER 0x00004000 + +/* Indicates an end-of-block entry in the litlen decode table */ +#define HUFFDEC_END_OF_BLOCK 0x00002000 + +/* Maximum number of bits that can be consumed by decoding a match length */ +#define LENGTH_MAXBITS (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS) +#define LENGTH_MAXFASTBITS (LITLEN_TABLEBITS /* no subtable needed */ + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS) + +/* + * Here is the format of our litlen decode table entries. Bits not explicitly + * described contain zeroes: + * + * Literals: + * Bit 31: 1 (HUFFDEC_LITERAL) + * Bit 23-16: literal value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length [not used] + * Bit 3-0: remaining codeword length + * Lengths: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 24-16: length base value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length + * Bit 4-0: remaining codeword length + number of extra bits + * End of block: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 1 (HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length [not used] + * Bit 3-0: remaining codeword length + * Subtable pointer: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 30-16: index of start of subtable + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: number of subtable bits + * Bit 3-0: number of main table bits + * + * This format has several desirable properties: + * + * - The codeword length, length slot base, and number of extra length bits + * are all built in. This eliminates the need to separately look up this + * information by indexing separate arrays by symbol or length slot. 
+ * + * - The HUFFDEC_* flags enable easily distinguishing between the different + * types of entries. The HUFFDEC_LITERAL flag enables a fast path for + * literals; the high bit is used for this, as some CPUs can test the + * high bit more easily than other bits. The HUFFDEC_EXCEPTIONAL flag + * makes it possible to detect the two unlikely cases (subtable pointer + * and end of block) in a single bit flag test. + * + * - The low byte is the number of bits that need to be removed from the + * bitstream; this makes this value easily accessible, and it enables the + * micro-optimization of doing 'bitsleft -= entry' instead of + * 'bitsleft -= (u8)entry'. It also includes the number of extra bits, + * so they don't need to be removed separately. + * + * - The flags in bits 15-13 are arranged to be 0 when the + * "remaining codeword length" in bits 11-8 is needed, making this value + * fairly easily accessible as well via a shift and downcast. + * + * - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are + * needed, making it possible to extract this value with '& 0x3F' rather + * than '& 0xF'. This value is only used as a shift amount, so this can + * save an 'and' instruction as the masking by 0x3F happens implicitly. + * + * litlen_decode_results[] contains the static part of the entry for each + * symbol. make_decode_table_entry() produces the final entries. + */ +static const u32 litlen_decode_results[] = { + + /* Literals */ +#define ENTRY(literal) (HUFFDEC_LITERAL | ((u32)literal << 16)) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) , + ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) , + ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) , + ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) , + ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) , + ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) , + ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) , + ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) , + ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) , + ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) , + ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) , + ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) , + ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) , + ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) , + ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) , + ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) , + ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) , + ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) , + ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) , + ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) , + ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) , + ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) , + ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) , + ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) , + ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) , + ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) , + ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) , + ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) , + ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) , + ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) , + ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) , + ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) , + ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) , + ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) , + ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) , + ENTRY(156) , 
ENTRY(157) , ENTRY(158) , ENTRY(159) , + ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) , + ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) , + ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) , + ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) , + ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) , + ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) , + ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) , + ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) , + ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) , + ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) , + ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) , + ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) , + ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) , + ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) , + ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) , + ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) , + ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) , + ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) , + ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) , + ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) , + ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) , + ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) , + ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) , + ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) , +#undef ENTRY + + /* End of block */ + HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK, + + /* Lengths */ +#define ENTRY(length_base, num_extra_bits) \ + (((u32)(length_base) << 16) | (num_extra_bits)) + ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), + ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), + ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), + ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), + ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), + ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), + ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), + ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , +#undef ENTRY +}; + +/* Maximum number of bits that can be consumed by decoding a match offset */ +#define OFFSET_MAXBITS (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS) +#define OFFSET_MAXFASTBITS (OFFSET_TABLEBITS /* no subtable needed */ + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS) + +/* + * Here is the format of our offset decode table entries. Bits not explicitly + * described contain zeroes: + * + * Offsets: + * Bit 31-16: offset base value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 11-8: remaining codeword length + * Bit 4-0: remaining codeword length + number of extra bits + * Subtable pointer: + * Bit 31-16: index of start of subtable + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) + * Bit 11-8: number of subtable bits + * Bit 3-0: number of main table bits + * + * These work the same way as the length entries and subtable pointer entries in + * the litlen decode table; see litlen_decode_results[] above. 
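+ *
+ * As a worked example (assuming offset symbol 4 happens to get a 5-bit
+ * codeword, which fits in the 8-bit main table): its static part below is
+ * ENTRY(5, 1) == 0x50001, and make_decode_table_entry() turns that into
+ * 0x50001 + (5 << 8) + 5 == 0x50506, i.e. offset base 5 in bits 31-16,
+ * remaining codeword length 5 in bits 11-8, and 5 + 1 == 6 total bits to
+ * consume (codeword plus extra bits) in bits 4-0.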
+ */ +static const u32 offset_decode_results[] = { +#define ENTRY(offset_base, num_extra_bits) \ + (((u32)(offset_base) << 16) | (num_extra_bits)) + ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , + ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , + ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , + ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , + ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , + ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , + ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , + ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , +#undef ENTRY +}; + +/* + * The main DEFLATE decompressor structure. Since libdeflate only supports + * full-buffer decompression, this structure doesn't store the entire + * decompression state, most of which is in stack variables. Instead, this + * struct just contains the decode tables and some temporary arrays used for + * building them, as these are too large to comfortably allocate on the stack. + * + * Storing the decode tables in the decompressor struct also allows the decode + * tables for the static codes to be reused whenever two static Huffman blocks + * are decoded without an intervening dynamic block, even across streams. + */ +struct libdeflate_decompressor { + + /* + * The arrays aren't all needed at the same time. 'precode_lens' and + * 'precode_decode_table' are unneeded after 'lens' has been filled. + * Furthermore, 'lens' need not be retained after building the litlen + * and offset decode tables. In fact, 'lens' can be in union with + * 'litlen_decode_table' provided that 'offset_decode_table' is separate + * and is built first. + */ + + union { + u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; + + struct { + u8 lens[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS + + DEFLATE_MAX_LENS_OVERRUN]; + + u32 precode_decode_table[PRECODE_ENOUGH]; + } l; + + u32 litlen_decode_table[LITLEN_ENOUGH]; + } u; + + u32 offset_decode_table[OFFSET_ENOUGH]; + + /* used only during build_decode_table() */ + u16 sorted_syms[DEFLATE_MAX_NUM_SYMS]; + + bool static_codes_loaded; + unsigned litlen_tablebits; +}; + +/* + * Build a table for fast decoding of symbols from a Huffman code. As input, + * this function takes the codeword length of each symbol which may be used in + * the code. As output, it produces a decode table for the canonical Huffman + * code described by the codeword lengths. The decode table is built with the + * assumption that it will be indexed with "bit-reversed" codewords, where the + * low-order bit is the first bit of the codeword. This format is used for all + * Huffman codes in DEFLATE. + * + * @decode_table + * The array in which the decode table will be generated. This array must + * have sufficient length; see the definition of the ENOUGH numbers. + * @lens + * An array which provides, for each symbol, the length of the + * corresponding codeword in bits, or 0 if the symbol is unused. This may + * alias @decode_table, since nothing is written to @decode_table until all + * @lens have been consumed. All codeword lengths are assumed to be <= + * @max_codeword_len but are otherwise considered untrusted. If they do + * not form a valid Huffman code, then the decode table is not built and + * %false is returned. + * @num_syms + * The number of symbols in the code, including all unused symbols. 
+ * @decode_results + * An array which gives the incomplete decode result for each symbol. The + * needed values in this array will be combined with codeword lengths to + * make the final decode table entries using make_decode_table_entry(). + * @table_bits + * The log base-2 of the number of main table entries to use. + * If @table_bits_ret != NULL, then @table_bits is treated as a maximum + * value and it will be decreased if a smaller table would be sufficient. + * @max_codeword_len + * The maximum allowed codeword length for this Huffman code. + * Must be <= DEFLATE_MAX_CODEWORD_LEN. + * @sorted_syms + * A temporary array of length @num_syms. + * @table_bits_ret + * If non-NULL, then the dynamic table_bits is enabled, and the actual + * table_bits value will be returned here. + * + * Returns %true if successful; %false if the codeword lengths do not form a + * valid Huffman code. + */ +static bool +build_decode_table(u32 decode_table[], + const u8 lens[], + const unsigned num_syms, + const u32 decode_results[], + unsigned table_bits, + unsigned max_codeword_len, + u16 *sorted_syms, + unsigned *table_bits_ret) +{ + unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned sym; /* current symbol */ + unsigned codeword; /* current codeword, bit-reversed */ + unsigned len; /* current codeword length in bits */ + unsigned count; /* num codewords remaining with this length */ + u32 codespace_used; /* codespace used out of '2^max_codeword_len' */ + unsigned cur_table_end; /* end index of current table */ + unsigned subtable_prefix; /* codeword prefix of current subtable */ + unsigned subtable_start; /* start index of current subtable */ + unsigned subtable_bits; /* log2 of current subtable length */ + + /* Count how many codewords have each length, including 0. */ + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + for (sym = 0; sym < num_syms; sym++) + len_counts[lens[sym]]++; + + /* + * Determine the actual maximum codeword length that was used, and + * decrease table_bits to it if allowed. + */ + while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0) + max_codeword_len--; + if (table_bits_ret != NULL) { + table_bits = MIN(table_bits, max_codeword_len); + *table_bits_ret = table_bits; + } + + /* + * Sort the symbols primarily by increasing codeword length and + * secondarily by increasing symbol value; or equivalently by their + * codewords in lexicographic order, since a canonical code is assumed. + * + * For efficiency, also compute 'codespace_used' in the same pass over + * 'len_counts[]' used to build 'offsets[]' for sorting. + */ + + /* Ensure that 'codespace_used' cannot overflow. */ + STATIC_ASSERT(sizeof(codespace_used) == 4); + STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >= + DEFLATE_MAX_NUM_SYMS); + + offsets[0] = 0; + offsets[1] = len_counts[0]; + codespace_used = 0; + for (len = 1; len < max_codeword_len; len++) { + offsets[len + 1] = offsets[len] + len_counts[len]; + codespace_used = (codespace_used << 1) + len_counts[len]; + } + codespace_used = (codespace_used << 1) + len_counts[len]; + + for (sym = 0; sym < num_syms; sym++) + sorted_syms[offsets[lens[sym]]++] = sym; + + sorted_syms += offsets[0]; /* Skip unused symbols */ + + /* lens[] is done being used, so we can write to decode_table[] now. 
 */
+
+	/*
+	 * Check whether the lengths form a complete code (exactly fills the
+	 * codespace), an incomplete code (doesn't fill the codespace), or an
+	 * overfull code (overflows the codespace). A codeword of length 'n'
+	 * uses proportion '1/(2^n)' of the codespace. An overfull code is
+	 * nonsensical, so is considered invalid. An incomplete code is
+	 * considered valid only in two specific cases; see below.
+	 */
+
+	/* overfull code? */
+	if (unlikely(codespace_used > (1U << max_codeword_len)))
+		return false;
+
+	/* incomplete code? */
+	if (unlikely(codespace_used < (1U << max_codeword_len))) {
+		u32 entry;
+		unsigned i;
+
+		/*
+		 * The DEFLATE RFC explicitly allows the offset code to be
+		 * incomplete in two cases: a code containing just 1 codeword,
+		 * if that codeword has length 1; and a code containing no
+		 * codewords. Note: the list of offset codeword lengths is
+		 * always nonempty, but lengths of 0 don't count as codewords.
+		 *
+		 * The RFC doesn't say whether the same cases are allowed for
+		 * the litlen and pre codes. It's actually impossible for no
+		 * symbols to be used from these codes; however, it's
+		 * technically possible for only one symbol to be used. zlib
+		 * allows 1 codeword for the litlen code, but not the precode.
+		 * The RFC also doesn't say whether, when there is 1 codeword,
+		 * that codeword is '0' or '1'. zlib uses '0'.
+		 *
+		 * We accept what zlib accepts, plus a bit more. First, we
+		 * don't treat the precode more strictly than the litlen and
+		 * offset codes. There's no convincing reason to add a special
+		 * case for the precode here.
+		 *
+		 * Second, we just map each allowed incomplete code to a complete
+		 * code with only real symbols. To do this, we choose a symbol,
+		 * either the used symbol (for codes with 1 codeword) or an
+		 * arbitrary symbol (for empty codes), and give it both
+		 * codewords '0' and '1'. zlib instead uses a special ERROR
+		 * symbol in the part of the codespace the code doesn't use.
+		 * However, having an ERROR symbol reduces the performance of
+		 * the Huffman decoder, for no real benefit. Our approach also
+		 * avoids having to decide whether '0' or '1' is correct.
+		 *
+		 * Like zlib, we still reject all incomplete codes that contain
+		 * more than 1 codeword or a codeword length greater than 1.
+		 */
+		if (codespace_used == 0) {
+			sym = 0; /* arbitrary */
+		} else {
+			if (codespace_used != (1U << (max_codeword_len - 1)) ||
+			    len_counts[1] != 1)
+				return false;
+			sym = sorted_syms[0];
+		}
+		entry = make_decode_table_entry(decode_results, sym, 1);
+		for (i = 0; i < (1U << table_bits); i++)
+			decode_table[i] = entry;
+		return true;
+	}
+
+	/*
+	 * The lengths form a complete code. Now, enumerate the codewords in
+	 * lexicographic order and fill the decode table entries for each one.
+	 *
+	 * First, process all codewords with len <= table_bits. Each one gets
+	 * '2^(table_bits-len)' direct entries in the table.
+	 *
+	 * Since DEFLATE uses bit-reversed codewords, these entries aren't
+	 * consecutive but rather are spaced '2^len' entries apart. This makes
+	 * filling them naively somewhat awkward and inefficient, since strided
+	 * stores are less cache-friendly and preclude the use of word or
+	 * vector-at-a-time stores to fill multiple entries per instruction.
+	 *
+	 * To optimize this, we incrementally double the table size. When
+	 * processing codewords with length 'len', the table is treated as
+	 * having only '2^len' entries, so each codeword uses just one entry.
+ * Then, each time 'len' is incremented, the table size is doubled and + * the first half is copied to the second half. This significantly + * improves performance over naively doing strided stores. + * + * Note that some entries copied for each table doubling may not have + * been initialized yet, but it doesn't matter since they're guaranteed + * to be initialized later (because the Huffman code is complete). + */ + codeword = 0; + len = 1; + while ((count = len_counts[len]) == 0) + len++; + cur_table_end = 1U << len; + while (len <= table_bits) { + /* Process all 'count' codewords with length 'len' bits. */ + do { + unsigned bit; + + /* Fill the first entry for the current codeword. */ + decode_table[codeword] = + make_decode_table_entry(decode_results, + *sorted_syms++, len); + + if (codeword == cur_table_end - 1) { + /* Last codeword (all 1's) */ + for (; len < table_bits; len++) { + __builtin_memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * + sizeof(decode_table[0])); + cur_table_end <<= 1; + } + return true; + } + /* + * To advance to the lexicographically next codeword in + * the canonical code, the codeword must be incremented, + * then 0's must be appended to the codeword as needed + * to match the next codeword's length. + * + * Since the codeword is bit-reversed, appending 0's is + * a no-op. However, incrementing it is nontrivial. To + * do so efficiently, use the 'bsr' instruction to find + * the last (highest order) 0 bit in the codeword, set + * it, and clear any later (higher order) 1 bits. But + * 'bsr' actually finds the highest order 1 bit, so to + * use it first flip all bits in the codeword by XOR'ing + * it with (1U << len) - 1 == cur_table_end - 1. + */ + bit = 1U << bsr32(codeword ^ (cur_table_end - 1)); + codeword &= bit - 1; + codeword |= bit; + } while (--count); + + /* Advance to the next codeword length. */ + do { + if (++len <= table_bits) { + __builtin_memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * sizeof(decode_table[0])); + cur_table_end <<= 1; + } + } while ((count = len_counts[len]) == 0); + } + + /* Process codewords with len > table_bits. These require subtables. */ + cur_table_end = 1U << table_bits; + subtable_prefix = -1; + subtable_start = 0; + for (;;) { + u32 entry; + unsigned i; + unsigned stride; + unsigned bit; + + /* + * Start a new subtable if the first 'table_bits' bits of the + * codeword don't match the prefix of the current subtable. + */ + if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) { + subtable_prefix = (codeword & ((1U << table_bits) - 1)); + subtable_start = cur_table_end; + /* + * Calculate the subtable length. If the codeword has + * length 'table_bits + n', then the subtable needs + * '2^n' entries. But it may need more; if fewer than + * '2^n' codewords of length 'table_bits + n' remain, + * then the length will need to be incremented to bring + * in longer codewords until the subtable can be + * completely filled. Note that because the Huffman + * code is complete, it will always be possible to fill + * the subtable eventually. + */ + subtable_bits = len - table_bits; + codespace_used = count; + while (codespace_used < (1U << subtable_bits)) { + subtable_bits++; + codespace_used = (codespace_used << 1) + + len_counts[table_bits + subtable_bits]; + } + cur_table_end = subtable_start + (1U << subtable_bits); + + /* + * Create the entry that points from the main table to + * the subtable. 
+ */ + decode_table[subtable_prefix] = + ((u32)subtable_start << 16) | + HUFFDEC_EXCEPTIONAL | + HUFFDEC_SUBTABLE_POINTER | + (subtable_bits << 8) | table_bits; + } + + /* Fill the subtable entries for the current codeword. */ + entry = make_decode_table_entry(decode_results, *sorted_syms++, + len - table_bits); + i = subtable_start + (codeword >> table_bits); + stride = 1U << (len - table_bits); + do { + decode_table[i] = entry; + i += stride; + } while (i < cur_table_end); + + /* Advance to the next codeword. */ + if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */ + return true; + bit = 1U << bsr32(codeword ^ ((1U << len) - 1)); + codeword &= bit - 1; + codeword |= bit; + count--; + while (count == 0) + count = len_counts[++len]; + } +} + +/* Build the decode table for the precode. */ +static bool +build_precode_decode_table(struct libdeflate_decompressor *d) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128); + + STATIC_ASSERT(ARRAY_LEN(precode_decode_results) == + DEFLATE_NUM_PRECODE_SYMS); + + return build_decode_table(d->u.l.precode_decode_table, + d->u.precode_lens, + DEFLATE_NUM_PRECODE_SYMS, + precode_decode_results, + PRECODE_TABLEBITS, + DEFLATE_MAX_PRE_CODEWORD_LEN, + d->sorted_syms, + NULL); +} + +/* Build the decode table for the literal/length code. */ +static bool +build_litlen_decode_table(struct libdeflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342); + + STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) == + DEFLATE_NUM_LITLEN_SYMS); + + return build_decode_table(d->u.litlen_decode_table, + d->u.l.lens, + num_litlen_syms, + litlen_decode_results, + LITLEN_TABLEBITS, + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + d->sorted_syms, + &d->litlen_tablebits); +} + +/* Build the decode table for the offset code. */ +static bool +build_offset_decode_table(struct libdeflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402); + + STATIC_ASSERT(ARRAY_LEN(offset_decode_results) == + DEFLATE_NUM_OFFSET_SYMS); + + return build_decode_table(d->offset_decode_table, + d->u.l.lens + num_litlen_syms, + num_offset_syms, + offset_decode_results, + OFFSET_TABLEBITS, + DEFLATE_MAX_OFFSET_CODEWORD_LEN, + d->sorted_syms, + NULL); +} + +/***************************************************************************** + * Main decompression routine + *****************************************************************************/ + +typedef enum libdeflate_result (*decompress_func_t) + (struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); + +#define FUNCNAME deflate_decompress_default +#undef ATTRIBUTES +#undef EXTRACT_VARBITS +#undef EXTRACT_VARBITS8 +#include "decompress_template.h" + +/* Include architecture-specific implementation(s) if available. 
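+ * (For this wasm build, neither ARCH_X86_32 nor ARCH_X86_64 should be
+ * defined, so no architecture-specific file is included and DEFAULT_IMPL
+ * falls back to deflate_decompress_default below.)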
*/ +#undef DEFAULT_IMPL +#undef arch_select_decompress_func +#if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# include "x86/decompress_impl.h" +#endif + +#ifndef DEFAULT_IMPL +# define DEFAULT_IMPL deflate_decompress_default +#endif + +#ifdef arch_select_decompress_func +static enum libdeflate_result +dispatch_decomp(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); + +static volatile decompress_func_t decompress_impl = dispatch_decomp; + +/* Choose the best implementation at runtime. */ +static enum libdeflate_result +dispatch_decomp(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) +{ + decompress_func_t f = arch_select_decompress_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + decompress_impl = f; + return f(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); +} +#else +/* The best implementation is statically known, so call it directly. */ +# define decompress_impl DEFAULT_IMPL +#endif + +/* + * This is the main DEFLATE decompression routine. See libdeflate.h for the + * documentation. + * + * Note that the real code is in decompress_template.h. The part here just + * handles calling the appropriate implementation depending on the CPU features + * at runtime. + */ +enum libdeflate_result +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) +{ + return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); +} + +enum libdeflate_result +libdeflate_deflate_decompress(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) +{ + return libdeflate_deflate_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); +} + +struct libdeflate_decompressor * +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options) +{ + struct libdeflate_decompressor *d; + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + d = __malloc(sizeof(*d)); + if (d == NULL) + return NULL; + /* + * Note that only certain parts of the decompressor actually must be + * initialized here: + * + * - 'static_codes_loaded' must be initialized to false. + * + * - The first half of the main portion of each decode table must be + * initialized to any value, to avoid reading from uninitialized + * memory during table expansion in build_decode_table(). (Although, + * this is really just to avoid warnings with dynamic tools like + * valgrind, since build_decode_table() is guaranteed to initialize + * all entries eventually anyway.) + * + * But for simplicity, we currently just zero the whole decompressor. 
+ */ + __builtin_memset(d, 0, sizeof(*d)); + return d; +} + +struct libdeflate_decompressor * +libdeflate_alloc_decompressor(void) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_decompressor_ex(&defaults); +} + +void +libdeflate_free_decompressor(struct libdeflate_decompressor *d) +{ + if (d) + __free(d); +} diff --git a/packages/wasm/lib/libdeflate/deflate_decompress.h b/packages/wasm/lib/libdeflate/deflate_decompress.h new file mode 100644 index 00000000..754fdba7 --- /dev/null +++ b/packages/wasm/lib/libdeflate/deflate_decompress.h @@ -0,0 +1,14 @@ +#ifndef LIB_DEFLATE_COMPRESS_H +#define LIB_DEFLATE_COMPRESS_H + +#include "lib_common.h" + +enum libdeflate_result +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + + +#endif /* LIB_DEFLATE_COMPRESS_H */ diff --git a/packages/wasm/lib/libdeflate/gzip_constants.h b/packages/wasm/lib/libdeflate/gzip_constants.h new file mode 100644 index 00000000..35e4728d --- /dev/null +++ b/packages/wasm/lib/libdeflate/gzip_constants.h @@ -0,0 +1,45 @@ +/* + * gzip_constants.h - constants for the gzip wrapper format + */ + +#ifndef LIB_GZIP_CONSTANTS_H +#define LIB_GZIP_CONSTANTS_H + +#define GZIP_MIN_HEADER_SIZE 10 +#define GZIP_FOOTER_SIZE 8 +#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE) + +#define GZIP_ID1 0x1F +#define GZIP_ID2 0x8B + +#define GZIP_CM_DEFLATE 8 + +#define GZIP_FTEXT 0x01 +#define GZIP_FHCRC 0x02 +#define GZIP_FEXTRA 0x04 +#define GZIP_FNAME 0x08 +#define GZIP_FCOMMENT 0x10 +#define GZIP_FRESERVED 0xE0 + +#define GZIP_MTIME_UNAVAILABLE 0 + +#define GZIP_XFL_SLOWEST_COMPRESSION 0x02 +#define GZIP_XFL_FASTEST_COMPRESSION 0x04 + +#define GZIP_OS_FAT 0 +#define GZIP_OS_AMIGA 1 +#define GZIP_OS_VMS 2 +#define GZIP_OS_UNIX 3 +#define GZIP_OS_VM_CMS 4 +#define GZIP_OS_ATARI_TOS 5 +#define GZIP_OS_HPFS 6 +#define GZIP_OS_MACINTOSH 7 +#define GZIP_OS_Z_SYSTEM 8 +#define GZIP_OS_CP_M 9 +#define GZIP_OS_TOPS_20 10 +#define GZIP_OS_NTFS 11 +#define GZIP_OS_QDOS 12 +#define GZIP_OS_RISCOS 13 +#define GZIP_OS_UNKNOWN 255 + +#endif /* LIB_GZIP_CONSTANTS_H */ diff --git a/packages/wasm/lib/libdeflate/gzip_decompress.c b/packages/wasm/lib/libdeflate/gzip_decompress.c new file mode 100644 index 00000000..3a4fabe6 --- /dev/null +++ b/packages/wasm/lib/libdeflate/gzip_decompress.c @@ -0,0 +1,160 @@ +/* + * gzip_decompress.c - decompress with a gzip wrapper + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "lib_common.h" +#include "gzip_constants.h" +#ifdef CRC32 +#include "crc32.h" +#endif + +enum libdeflate_result +libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) +{ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + u8 flg; + size_t actual_in_nbytes; + size_t actual_out_nbytes; + enum libdeflate_result result; + + if (in_nbytes < GZIP_MIN_OVERHEAD) + return LIBDEFLATE_BAD_DATA; + + /* ID1 */ + if (*in_next++ != GZIP_ID1) + return LIBDEFLATE_BAD_DATA; + /* ID2 */ + if (*in_next++ != GZIP_ID2) + return LIBDEFLATE_BAD_DATA; + /* CM */ + if (*in_next++ != GZIP_CM_DEFLATE) + return LIBDEFLATE_BAD_DATA; + flg = *in_next++; + /* MTIME */ + in_next += 4; + /* XFL */ + in_next += 1; + /* OS */ + in_next += 1; + + if (flg & GZIP_FRESERVED) + return LIBDEFLATE_BAD_DATA; + + /* Extra field */ + if (flg & GZIP_FEXTRA) { + u16 xlen = get_unaligned_le16(in_next); + in_next += 2; + + if (in_end - in_next < (u32)xlen + GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + + in_next += xlen; + } + + /* Original file name (zero terminated) */ + if (flg & GZIP_FNAME) { + while (*in_next++ != 0 && in_next != in_end) + ; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* File comment (zero terminated) */ + if (flg & GZIP_FCOMMENT) { + while (*in_next++ != 0 && in_next != in_end) + ; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* CRC16 for gzip header */ + if (flg & GZIP_FHCRC) { + in_next += 2; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* Compressed data */ + result = libdeflate_deflate_decompress_ex(d, in_next, + in_end - GZIP_FOOTER_SIZE - in_next, + out, out_nbytes_avail, + &actual_in_nbytes, + actual_out_nbytes_ret); + if (result != LIBDEFLATE_SUCCESS) + return result; + + if (actual_out_nbytes_ret) + actual_out_nbytes = *actual_out_nbytes_ret; + else + actual_out_nbytes = out_nbytes_avail; + + in_next += actual_in_nbytes; + + /* CRC32 */ + #ifdef CRC32 + // this library is supposed to be used for MTProto + // there's no need to check for CRC32, since the data is guaranteed to be correct + // by the protocol itself. 
not including crc32 implementation allows us to + // save around 8kb of code size + if (libdeflate_crc32(0, out, actual_out_nbytes) != + get_unaligned_le32(in_next)) + return LIBDEFLATE_BAD_DATA; + #endif + in_next += 4; + + /* ISIZE */ + if ((u32)actual_out_nbytes != get_unaligned_le32(in_next)) + return LIBDEFLATE_BAD_DATA; + in_next += 4; + + if (actual_in_nbytes_ret) + *actual_in_nbytes_ret = in_next - (u8 *)in; + + return LIBDEFLATE_SUCCESS; +} + +LIBDEFLATEAPI int32_t +libdeflate_gzip_get_output_size(const void* in, size_t in_nbytes) { + return get_unaligned_le32((u8*)in + in_nbytes - 4); +} + +LIBDEFLATEAPI enum libdeflate_result +libdeflate_gzip_decompress(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) +{ + // we're using `libdeflate_zlib_get_output_size` to allocate exactly the + // right amount of memory for the output buffer, so this is redundant + size_t actual_out_nbytes_ret; + return libdeflate_gzip_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, &actual_out_nbytes_ret); +} diff --git a/packages/wasm/lib/libdeflate/hc_matchfinder.h b/packages/wasm/lib/libdeflate/hc_matchfinder.h new file mode 100644 index 00000000..edf4e277 --- /dev/null +++ b/packages/wasm/lib/libdeflate/hc_matchfinder.h @@ -0,0 +1,401 @@ +/* + * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * --------------------------------------------------------------------------- + * + * Algorithm + * + * This is a Hash Chains (hc) based matchfinder. + * + * The main data structure is a hash table where each hash bucket contains a + * linked list (or "chain") of sequences whose first 4 bytes share the same hash + * code. Each sequence is identified by its starting position in the input + * buffer. + * + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 4 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, this hash + * bucket's linked list is searched for matches. Then, a new linked list node + * is created to represent the current sequence and is prepended to the list. 
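+ *
+ * For instance (hypothetical positions): if the sequences starting at
+ * positions 100, 250 and 400 all hash into the same bucket, then after they
+ * have been inserted the bucket head is 400 and the links are
+ * next_tab[400] == 250 and next_tab[250] == 100, so a later search through
+ * that bucket visits position 400, then 250, then 100.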
+ * + * This algorithm has several useful properties: + * + * - It only finds true Lempel-Ziv matches; i.e., those where the matching + * sequence occurs prior to the sequence being matched against. + * + * - The sequences in each linked list are always sorted by decreasing starting + * position. Therefore, the closest (smallest offset) matches are found + * first, which in many compression formats tend to be the cheapest to encode. + * + * - Although fast running time is not guaranteed due to the possibility of the + * lists getting very long, the worst degenerate behavior can be easily + * prevented by capping the number of nodes searched at each position. + * + * - If the compressor decides not to search for matches at a certain position, + * then that position can be quickly inserted without searching the list. + * + * - The algorithm is adaptable to sliding windows: just store the positions + * relative to a "base" value that is updated from time to time, and stop + * searching each list when the sequences get too far away. + * + * ---------------------------------------------------------------------------- + * + * Optimizations + * + * The main hash table and chains handle length 4+ matches. Length 3 matches + * are handled by a separate hash table with no chains. This works well for + * typical "greedy" or "lazy"-style compressors, where length 3 matches are + * often only helpful if they have small offsets. Instead of searching a full + * chain for length 3+ matches, the algorithm just checks for one close length 3 + * match, then focuses on finding length 4+ matches. + * + * The longest_match() and skip_bytes() functions are inlined into the + * compressors that use them. This isn't just about saving the overhead of a + * function call. These functions are intended to be called from the inner + * loops of compressors, where giving the compiler more control over register + * allocation is very helpful. There is also significant benefit to be gained + * from allowing the CPU to predict branches independently at each call site. + * For example, "lazy"-style compressors can be written with two calls to + * longest_match(), each of which starts with a different 'best_len' and + * therefore has significantly different performance characteristics. + * + * Although any hash function can be used, a multiplicative hash is fast and + * works well. + * + * On some processors, it is significantly faster to extend matches by whole + * words (32 or 64 bits) instead of by individual bytes. For this to be the + * case, the processor must implement unaligned memory accesses efficiently and + * must have either a fast "find first set bit" instruction or a fast "find last + * set bit" instruction, depending on the processor's endianness. + * + * The code uses one loop for finding the first match and one loop for finding a + * longer match. Each of these loops is tuned for its respective task and in + * combination are faster than a single generalized loop that handles both + * tasks. + * + * The code also uses a tight inner loop that only compares the last and first + * bytes of a potential match. It is only when these bytes match that a full + * match extension is attempted. 
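+ *
+ * (Sketch of the word-at-a-time extension idea on a little-endian machine
+ * with fast unaligned loads: XOR an 8-byte load from each side and, if the
+ * result is nonzero, count its trailing zero bits and divide by 8 to find
+ * the first mismatching byte; the actual lz_extend() helper may differ in
+ * detail.)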
+ * + * ---------------------------------------------------------------------------- + */ + +#ifndef LIB_HC_MATCHFINDER_H +#define LIB_HC_MATCHFINDER_H + +#include "matchfinder_common.h" + +#define HC_MATCHFINDER_HASH3_ORDER 15 +#define HC_MATCHFINDER_HASH4_ORDER 16 + +#define HC_MATCHFINDER_TOTAL_HASH_SIZE \ + (((1UL << HC_MATCHFINDER_HASH3_ORDER) + \ + (1UL << HC_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t)) + +struct MATCHFINDER_ALIGNED hc_matchfinder { + + /* The hash table for finding length 3 matches */ + mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER]; + + /* The hash table which contains the first nodes of the linked lists for + * finding length 4+ matches */ + mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER]; + + /* The "next node" references for the linked lists. The "next node" of + * the node for the sequence with position 'pos' is 'next_tab[pos]'. */ + mf_pos_t next_tab[MATCHFINDER_WINDOW_SIZE]; +}; + +/* Prepare the matchfinder for a new input buffer. */ +static void +hc_matchfinder_init(struct hc_matchfinder *mf) +{ + STATIC_ASSERT(HC_MATCHFINDER_TOTAL_HASH_SIZE % + MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_init((mf_pos_t *)mf, HC_MATCHFINDER_TOTAL_HASH_SIZE); +} + +static void +hc_matchfinder_slide_window(struct hc_matchfinder *mf) +{ + STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf)); +} + +/* + * Find the longest match longer than 'best_len' bytes. + * + * @mf + * The matchfinder structure. + * @in_base_p + * Location of a pointer which points to the place in the input data the + * matchfinder currently stores positions relative to. This may be updated + * by this function. + * @in_next + * Pointer to the next position in the input buffer, i.e. the sequence + * being matched against. + * @best_len + * Require a match longer than this length. + * @max_len + * The maximum permissible match length at this position. + * @nice_len + * Stop searching if a match of at least this length is found. + * Must be <= @max_len. + * @max_search_depth + * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + 1. + * @offset_ret + * If a match is found, its offset is returned in this location. + * + * Return the length of the match found, or 'best_len' if no match longer than + * 'best_len' was found. + */ +static u32 +hc_matchfinder_longest_match(struct hc_matchfinder * const mf, + const u8 ** const in_base_p, + const u8 * const in_next, + u32 best_len, + const u32 max_len, + const u32 nice_len, + const u32 max_search_depth, + u32 * const next_hashes, + u32 * const offset_ret) +{ + u32 depth_remaining = max_search_depth; + const u8 *best_matchptr = in_next; + mf_pos_t cur_node3, cur_node4; + u32 hash3, hash4; + u32 next_hashseq; + u32 seq4; + const u8 *matchptr; + u32 len; + u32 cur_pos = in_next - *in_base_p; + const u8 *in_base; + mf_pos_t cutoff; + + if (cur_pos == MATCHFINDER_WINDOW_SIZE) { + hc_matchfinder_slide_window(mf); + *in_base_p += MATCHFINDER_WINDOW_SIZE; + cur_pos = 0; + } + + in_base = *in_base_p; + cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE; + + if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */ + goto out; + + /* Get the precomputed hash codes. 
*/ + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + + /* From the hash buckets, get the first node of each linked list. */ + cur_node3 = mf->hash3_tab[hash3]; + cur_node4 = mf->hash4_tab[hash4]; + + /* Update for length 3 matches. This replaces the singleton node in the + * 'hash3' bucket with the node for the current sequence. */ + mf->hash3_tab[hash3] = cur_pos; + + /* Update for length 4 matches. This prepends the node for the current + * sequence to the linked list in the 'hash4' bucket. */ + mf->hash4_tab[hash4] = cur_pos; + mf->next_tab[cur_pos] = cur_node4; + + /* Compute the next hash codes. */ + next_hashseq = get_unaligned_le32(in_next + 1); + next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); + next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); + prefetchw(&mf->hash3_tab[next_hashes[0]]); + prefetchw(&mf->hash4_tab[next_hashes[1]]); + + if (best_len < 4) { /* No match of length >= 4 found yet? */ + + /* Check for a length 3 match if needed. */ + + if (cur_node3 <= cutoff) + goto out; + + seq4 = load_u32_unaligned(in_next); + + if (best_len < 3) { + matchptr = &in_base[cur_node3]; + if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) { + best_len = 3; + best_matchptr = matchptr; + } + } + + /* Check for a length 4 match. */ + + if (cur_node4 <= cutoff) + goto out; + + for (;;) { + /* No length 4 match found yet. Check the first 4 bytes. */ + matchptr = &in_base[cur_node4]; + + if (load_u32_unaligned(matchptr) == seq4) + break; + + /* The first 4 bytes did not match. Keep trying. */ + cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; + if (cur_node4 <= cutoff || !--depth_remaining) + goto out; + } + + /* Found a match of length >= 4. Extend it to its full length. */ + best_matchptr = matchptr; + best_len = lz_extend(in_next, best_matchptr, 4, max_len); + if (best_len >= nice_len) + goto out; + cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; + if (cur_node4 <= cutoff || !--depth_remaining) + goto out; + } else { + if (cur_node4 <= cutoff || best_len >= nice_len) + goto out; + } + + /* Check for matches of length >= 5. */ + + for (;;) { + for (;;) { + matchptr = &in_base[cur_node4]; + + /* Already found a length 4 match. Try for a longer + * match; start by checking either the last 4 bytes and + * the first 4 bytes, or the last byte. (The last byte, + * the one which would extend the match length by 1, is + * the most important.) */ + #if UNALIGNED_ACCESS_IS_FAST + if ((load_u32_unaligned(matchptr + best_len - 3) == + load_u32_unaligned(in_next + best_len - 3)) && + (load_u32_unaligned(matchptr) == + load_u32_unaligned(in_next))) + #else + if (matchptr[best_len] == in_next[best_len]) + #endif + break; + + /* Continue to the next node in the list. */ + cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; + if (cur_node4 <= cutoff || !--depth_remaining) + goto out; + } + + #if UNALIGNED_ACCESS_IS_FAST + len = 4; + #else + len = 0; + #endif + len = lz_extend(in_next, matchptr, len, max_len); + if (len > best_len) { + /* This is the new longest match. */ + best_len = len; + best_matchptr = matchptr; + if (best_len >= nice_len) + goto out; + } + + /* Continue to the next node in the list. */ + cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; + if (cur_node4 <= cutoff || !--depth_remaining) + goto out; + } +out: + *offset_ret = in_next - best_matchptr; + return best_len; +} + +/* + * Advance the matchfinder, but don't search for matches. 
+ * + * @mf + * The matchfinder structure. + * @in_base_p + * Location of a pointer which points to the place in the input data the + * matchfinder currently stores positions relative to. This may be updated + * by this function. + * @in_next + * Pointer to the next position in the input buffer. + * @in_end + * Pointer to the end of the input buffer. + * @count + * The number of bytes to advance. Must be > 0. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + @count. + */ +static void +hc_matchfinder_skip_bytes(struct hc_matchfinder * const mf, + const u8 ** const in_base_p, + const u8 *in_next, + const u8 * const in_end, + const u32 count, + u32 * const next_hashes) +{ + u32 cur_pos; + u32 hash3, hash4; + u32 next_hashseq; + u32 remaining = count; + + if (unlikely(count + 5 > in_end - in_next)) + return; + + cur_pos = in_next - *in_base_p; + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + do { + if (cur_pos == MATCHFINDER_WINDOW_SIZE) { + hc_matchfinder_slide_window(mf); + *in_base_p += MATCHFINDER_WINDOW_SIZE; + cur_pos = 0; + } + mf->hash3_tab[hash3] = cur_pos; + mf->next_tab[cur_pos] = mf->hash4_tab[hash4]; + mf->hash4_tab[hash4] = cur_pos; + + next_hashseq = get_unaligned_le32(++in_next); + hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); + hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); + cur_pos++; + } while (--remaining); + + prefetchw(&mf->hash3_tab[hash3]); + prefetchw(&mf->hash4_tab[hash4]); + next_hashes[0] = hash3; + next_hashes[1] = hash4; +} + +#endif /* LIB_HC_MATCHFINDER_H */ diff --git a/packages/wasm/lib/libdeflate/ht_matchfinder.h b/packages/wasm/lib/libdeflate/ht_matchfinder.h new file mode 100644 index 00000000..6437492f --- /dev/null +++ b/packages/wasm/lib/libdeflate/ht_matchfinder.h @@ -0,0 +1,234 @@ +/* + * ht_matchfinder.h - Lempel-Ziv matchfinding with a hash table + * + * Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * --------------------------------------------------------------------------- + * + * This is a Hash Table (ht) matchfinder. + * + * This is a variant of the Hash Chains (hc) matchfinder that is optimized for + * very fast compression. The ht_matchfinder stores the hash chains inline in + * the hash table, whereas the hc_matchfinder stores them in a separate array. 
+ * Storing the hash chains inline is the faster method when max_search_depth + * (the maximum chain length) is very small. It is not appropriate when + * max_search_depth is larger, as then it uses too much memory. + * + * Due to its focus on speed, the ht_matchfinder doesn't support length 3 + * matches. It also doesn't allow max_search_depth to vary at runtime; it is + * fixed at build time as HT_MATCHFINDER_BUCKET_SIZE. + * + * See hc_matchfinder.h for more information. + */ + +#ifndef LIB_HT_MATCHFINDER_H +#define LIB_HT_MATCHFINDER_H + +#include "matchfinder_common.h" + +#define HT_MATCHFINDER_HASH_ORDER 15 +#define HT_MATCHFINDER_BUCKET_SIZE 2 + +#define HT_MATCHFINDER_MIN_MATCH_LEN 4 +/* Minimum value of max_len for ht_matchfinder_longest_match() */ +#define HT_MATCHFINDER_REQUIRED_NBYTES 5 + +struct MATCHFINDER_ALIGNED ht_matchfinder { + mf_pos_t hash_tab[1UL << HT_MATCHFINDER_HASH_ORDER] + [HT_MATCHFINDER_BUCKET_SIZE]; +}; + +static void +ht_matchfinder_init(struct ht_matchfinder *mf) +{ + STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_init((mf_pos_t *)mf, sizeof(*mf)); +} + +static void +ht_matchfinder_slide_window(struct ht_matchfinder *mf) +{ + matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf)); +} + +/* Note: max_len must be >= HT_MATCHFINDER_REQUIRED_NBYTES */ +static u32 +ht_matchfinder_longest_match(struct ht_matchfinder * const mf, + const u8 ** const in_base_p, + const u8 * const in_next, + const u32 max_len, + const u32 nice_len, + u32 * const next_hash, + u32 * const offset_ret) +{ + u32 best_len = 0; + const u8 *best_matchptr = in_next; + u32 cur_pos = in_next - *in_base_p; + const u8 *in_base; + mf_pos_t cutoff; + u32 hash; + u32 seq; + mf_pos_t cur_node; + const u8 *matchptr; +#if HT_MATCHFINDER_BUCKET_SIZE > 1 + mf_pos_t to_insert; + u32 len; +#endif +#if HT_MATCHFINDER_BUCKET_SIZE > 2 + int i; +#endif + + /* This is assumed throughout this function. */ + STATIC_ASSERT(HT_MATCHFINDER_MIN_MATCH_LEN == 4); + + if (cur_pos == MATCHFINDER_WINDOW_SIZE) { + ht_matchfinder_slide_window(mf); + *in_base_p += MATCHFINDER_WINDOW_SIZE; + cur_pos = 0; + } + in_base = *in_base_p; + cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE; + + hash = *next_hash; + STATIC_ASSERT(HT_MATCHFINDER_REQUIRED_NBYTES == 5); + *next_hash = lz_hash(get_unaligned_le32(in_next + 1), + HT_MATCHFINDER_HASH_ORDER); + seq = load_u32_unaligned(in_next); + prefetchw(&mf->hash_tab[*next_hash]); +#if HT_MATCHFINDER_BUCKET_SIZE == 1 + /* Hand-unrolled version for BUCKET_SIZE == 1 */ + cur_node = mf->hash_tab[hash][0]; + mf->hash_tab[hash][0] = cur_pos; + if (cur_node <= cutoff) + goto out; + matchptr = &in_base[cur_node]; + if (load_u32_unaligned(matchptr) == seq) { + best_len = lz_extend(in_next, matchptr, 4, max_len); + best_matchptr = matchptr; + } +#elif HT_MATCHFINDER_BUCKET_SIZE == 2 + /* + * Hand-unrolled version for BUCKET_SIZE == 2. The logic here also + * differs slightly in that it copies the first entry to the second even + * if nice_len is reached on the first, as this can be slightly faster. 
+ */ + cur_node = mf->hash_tab[hash][0]; + mf->hash_tab[hash][0] = cur_pos; + if (cur_node <= cutoff) + goto out; + matchptr = &in_base[cur_node]; + + to_insert = cur_node; + cur_node = mf->hash_tab[hash][1]; + mf->hash_tab[hash][1] = to_insert; + + if (load_u32_unaligned(matchptr) == seq) { + best_len = lz_extend(in_next, matchptr, 4, max_len); + best_matchptr = matchptr; + if (cur_node <= cutoff || best_len >= nice_len) + goto out; + matchptr = &in_base[cur_node]; + if (load_u32_unaligned(matchptr) == seq && + load_u32_unaligned(matchptr + best_len - 3) == + load_u32_unaligned(in_next + best_len - 3)) { + len = lz_extend(in_next, matchptr, 4, max_len); + if (len > best_len) { + best_len = len; + best_matchptr = matchptr; + } + } + } else { + if (cur_node <= cutoff) + goto out; + matchptr = &in_base[cur_node]; + if (load_u32_unaligned(matchptr) == seq) { + best_len = lz_extend(in_next, matchptr, 4, max_len); + best_matchptr = matchptr; + } + } +#else + /* Generic version for HT_MATCHFINDER_BUCKET_SIZE > 2 */ + to_insert = cur_pos; + for (i = 0; i < HT_MATCHFINDER_BUCKET_SIZE; i++) { + cur_node = mf->hash_tab[hash][i]; + mf->hash_tab[hash][i] = to_insert; + if (cur_node <= cutoff) + goto out; + matchptr = &in_base[cur_node]; + if (load_u32_unaligned(matchptr) == seq) { + len = lz_extend(in_next, matchptr, 4, max_len); + if (len > best_len) { + best_len = len; + best_matchptr = matchptr; + if (best_len >= nice_len) + goto out; + } + } + to_insert = cur_node; + } +#endif +out: + *offset_ret = in_next - best_matchptr; + return best_len; +} + +static void +ht_matchfinder_skip_bytes(struct ht_matchfinder * const mf, + const u8 ** const in_base_p, + const u8 *in_next, + const u8 * const in_end, + const u32 count, + u32 * const next_hash) +{ + s32 cur_pos = in_next - *in_base_p; + u32 hash; + u32 remaining = count; + int i; + + if (unlikely(count + HT_MATCHFINDER_REQUIRED_NBYTES > in_end - in_next)) + return; + + if (cur_pos + count - 1 >= MATCHFINDER_WINDOW_SIZE) { + ht_matchfinder_slide_window(mf); + *in_base_p += MATCHFINDER_WINDOW_SIZE; + cur_pos -= MATCHFINDER_WINDOW_SIZE; + } + + hash = *next_hash; + do { + for (i = HT_MATCHFINDER_BUCKET_SIZE - 1; i > 0; i--) + mf->hash_tab[hash][i] = mf->hash_tab[hash][i - 1]; + mf->hash_tab[hash][0] = cur_pos; + + hash = lz_hash(get_unaligned_le32(++in_next), + HT_MATCHFINDER_HASH_ORDER); + cur_pos++; + } while (--remaining); + + prefetchw(&mf->hash_tab[hash]); + *next_hash = hash; +} + +#endif /* LIB_HT_MATCHFINDER_H */ diff --git a/packages/wasm/lib/libdeflate/matchfinder_common.h b/packages/wasm/lib/libdeflate/matchfinder_common.h new file mode 100644 index 00000000..07c44673 --- /dev/null +++ b/packages/wasm/lib/libdeflate/matchfinder_common.h @@ -0,0 +1,194 @@ +/* + * matchfinder_common.h - common code for Lempel-Ziv matchfinding + */ + +#ifndef LIB_MATCHFINDER_COMMON_H +#define LIB_MATCHFINDER_COMMON_H + +#include "lib_common.h" + +#ifndef MATCHFINDER_WINDOW_ORDER +# error "MATCHFINDER_WINDOW_ORDER must be defined!" +#endif + +/* + * Given a 32-bit value that was loaded with the platform's native endianness, + * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24 + * bits contain the first 3 bytes, arranged in octets in a platform-dependent + * order, at the memory location from which the input 32-bit value was loaded. + */ +static u32 +loaded_u32_to_u24(u32 v) +{ + if (CPU_IS_LITTLE_ENDIAN()) + return v & 0xFFFFFF; + else + return v >> 8; +} + +/* + * Load the next 3 bytes from @p into the 24 low-order bits of a 32-bit value. 
+ * The order in which the 3 bytes will be arranged as octets in the 24 bits is + * platform-dependent. At least 4 bytes (not 3) must be available at @p. + */ +static u32 +load_u24_unaligned(const u8 *p) +{ +#if UNALIGNED_ACCESS_IS_FAST + return loaded_u32_to_u24(load_u32_unaligned(p)); +#else + if (CPU_IS_LITTLE_ENDIAN()) + return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); + else + return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16); +#endif +} + +#define MATCHFINDER_WINDOW_SIZE (1UL << MATCHFINDER_WINDOW_ORDER) + +typedef s16 mf_pos_t; + +#define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE) + +/* + * Required alignment of the matchfinder buffer pointer and size. The values + * here come from the AVX-2 implementation, which is the worst case. + */ +#define MATCHFINDER_MEM_ALIGNMENT 32 +#define MATCHFINDER_SIZE_ALIGNMENT 128 + +#undef matchfinder_init +#undef matchfinder_rebase +#ifdef _aligned_attribute +# define MATCHFINDER_ALIGNED _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT) +#else +# define MATCHFINDER_ALIGNED +#endif + +/* + * Initialize the hash table portion of the matchfinder. + * + * Essentially, this is an optimized memset(). + * + * 'data' must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and + * 'size' must be a multiple of MATCHFINDER_SIZE_ALIGNMENT. + */ +#ifndef matchfinder_init +static void +matchfinder_init(mf_pos_t *data, size_t size) +{ + size_t num_entries = size / sizeof(*data); + size_t i; + + for (i = 0; i < num_entries; i++) + data[i] = MATCHFINDER_INITVAL; +} +#endif + +/* + * Slide the matchfinder by MATCHFINDER_WINDOW_SIZE bytes. + * + * This must be called just after each MATCHFINDER_WINDOW_SIZE bytes have been + * run through the matchfinder. + * + * This subtracts MATCHFINDER_WINDOW_SIZE bytes from each entry in the given + * array, making the entries be relative to the current position rather than the + * position MATCHFINDER_WINDOW_SIZE bytes prior. To avoid integer underflows, + * entries that would become less than -MATCHFINDER_WINDOW_SIZE stay at + * -MATCHFINDER_WINDOW_SIZE, keeping them permanently out of bounds. + * + * The given array must contain all matchfinder data that is position-relative: + * the hash table(s) as well as any hash chain or binary tree links. Its + * address must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and its size + * must be a multiple of MATCHFINDER_SIZE_ALIGNMENT. + */ +#ifndef matchfinder_rebase +static void +matchfinder_rebase(mf_pos_t *data, size_t size) +{ + size_t num_entries = size / sizeof(*data); + size_t i; + + if (MATCHFINDER_WINDOW_SIZE == 32768) { + /* + * Branchless version for 32768-byte windows. Clear all bits if + * the value was already negative, then set the sign bit. This + * is equivalent to subtracting 32768 with signed saturation. + */ + for (i = 0; i < num_entries; i++) + data[i] = 0x8000 | (data[i] & ~(data[i] >> 15)); + } else { + for (i = 0; i < num_entries; i++) { + if (data[i] >= 0) + data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; + else + data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; + } + } +} +#endif + +/* + * The hash function: given a sequence prefix held in the low-order bits of a + * 32-bit value, multiply by a carefully-chosen large constant. Discard any + * bits of the product that don't fit in a 32-bit value, but take the + * next-highest @num_bits bits of the product as the hash value, as those have + * the most randomness. 
+ */ +static u32 +lz_hash(u32 seq, unsigned num_bits) +{ + return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits); +} + +/* + * Return the number of bytes at @matchptr that match the bytes at @strptr, up + * to a maximum of @max_len. Initially, @start_len bytes are matched. + */ +static unsigned +lz_extend(const u8 * const strptr, const u8 * const matchptr, + const unsigned start_len, const unsigned max_len) +{ + unsigned len = start_len; + machine_word_t v_word; + + if (UNALIGNED_ACCESS_IS_FAST) { + + if (likely(max_len - len >= 4 * WORDBYTES)) { + + #define COMPARE_WORD_STEP \ + v_word = load_word_unaligned(&matchptr[len]) ^ \ + load_word_unaligned(&strptr[len]); \ + if (v_word != 0) \ + goto word_differs; \ + len += WORDBYTES; \ + + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + #undef COMPARE_WORD_STEP + } + + while (len + WORDBYTES <= max_len) { + v_word = load_word_unaligned(&matchptr[len]) ^ + load_word_unaligned(&strptr[len]); + if (v_word != 0) + goto word_differs; + len += WORDBYTES; + } + } + + while (len < max_len && matchptr[len] == strptr[len]) + len++; + return len; + +word_differs: + if (CPU_IS_LITTLE_ENDIAN()) + len += (bsfw(v_word) >> 3); + else + len += (WORDBITS - 1 - bsrw(v_word)) >> 3; + return len; +} + +#endif /* LIB_MATCHFINDER_COMMON_H */ diff --git a/packages/wasm/lib/libdeflate/zlib_compress.c b/packages/wasm/lib/libdeflate/zlib_compress.c new file mode 100644 index 00000000..b486c33e --- /dev/null +++ b/packages/wasm/lib/libdeflate/zlib_compress.c @@ -0,0 +1,83 @@ +/* + * zlib_compress.c - compress with a zlib wrapper + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "deflate_compress.h" +#include "zlib_constants.h" +#include "adler32.h" + +LIBDEFLATEAPI size_t +libdeflate_zlib_compress(struct libdeflate_compressor *c, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) +{ + u8 *out_next = out; + u16 hdr; + unsigned compression_level; + unsigned level_hint; + size_t deflate_size; + + if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD) + return 0; + + /* 2 byte header: CMF and FLG */ + hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12); + compression_level = libdeflate_get_compression_level(c); + if (compression_level < 2) + level_hint = ZLIB_FASTEST_COMPRESSION; + else if (compression_level < 6) + level_hint = ZLIB_FAST_COMPRESSION; + else if (compression_level < 8) + level_hint = ZLIB_DEFAULT_COMPRESSION; + else + level_hint = ZLIB_SLOWEST_COMPRESSION; + hdr |= level_hint << 6; + hdr |= 31 - (hdr % 31); + + put_unaligned_be16(hdr, out_next); + out_next += 2; + + /* Compressed data */ + deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, + out_nbytes_avail - ZLIB_MIN_OVERHEAD); + if (deflate_size == 0) + return 0; + out_next += deflate_size; + + /* ADLER32 */ + put_unaligned_be32(libdeflate_adler32(1, in, in_nbytes), out_next); + out_next += 4; + + return out_next - (u8 *)out; +} + +/*LIBDEFLATEAPI*/ static size_t +libdeflate_zlib_compress_bound(struct libdeflate_compressor *c, + size_t in_nbytes) +{ + return ZLIB_MIN_OVERHEAD + + libdeflate_deflate_compress_bound(c, in_nbytes); +} diff --git a/packages/wasm/lib/libdeflate/zlib_constants.h b/packages/wasm/lib/libdeflate/zlib_constants.h new file mode 100644 index 00000000..f304310c --- /dev/null +++ b/packages/wasm/lib/libdeflate/zlib_constants.h @@ -0,0 +1,21 @@ +/* + * zlib_constants.h - constants for the zlib wrapper format + */ + +#ifndef LIB_ZLIB_CONSTANTS_H +#define LIB_ZLIB_CONSTANTS_H + +#define ZLIB_MIN_HEADER_SIZE 2 +#define ZLIB_FOOTER_SIZE 4 +#define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE) + +#define ZLIB_CM_DEFLATE 8 + +#define ZLIB_CINFO_32K_WINDOW 7 + +#define ZLIB_FASTEST_COMPRESSION 0 +#define ZLIB_FAST_COMPRESSION 1 +#define ZLIB_DEFAULT_COMPRESSION 2 +#define ZLIB_SLOWEST_COMPRESSION 3 + +#endif /* LIB_ZLIB_CONSTANTS_H */ diff --git a/packages/wasm/lib/mtcute.wasm b/packages/wasm/lib/mtcute.wasm new file mode 100755 index 0000000000000000000000000000000000000000..9d13ed6ad354f9e7e946c79ae8d3fc0e7b080a6d GIT binary patch literal 45120 zcmd6w37l2)8~@KacVA}a&NQvla_((0REqYEIceV!lC-aP+Gc6A&$Osi2vGwkJZbMHO(obU2{pU?9=-_Q4ZZrX@~NxG(K zdO_N7y;vLWF6PjN8}i4c-g@K=pY&qASkCj;t4?t)V87unXVtB|+Bfhq-x$^kW+5A zSJT}deX8#DdR)3@c>OfcD*4hK@;zasc_)pTG$ntAq4|dmn>1qL#3`dR%{y$^*!(eL zG(BtLgps4ijGZ{5aLh3E-LO$pCQZ#BQ&2D^Uu#;c%(rrV*)LD7Q|3$i*3o0i+}yl& zneWxj<-R_pUYW1Q&73fG*tju;!=_9toI0&=SiyvuW3*Gt)~>E7EkJ8gqs;9yd7o^W z)-rX%xG|@;YCUYsBW>k*Ub zo1(dLxY3BaRmE0P=YH_%MK$pv94 zB@%U;eoJd@_^E$$vv@4UDk-;{%uJ)0Ci`Wu zJ*Xz;lafnHjBXU25@}+3OgSxQ;xV77UyD+VU&d9wh!P@ia}7fDM~ zOs{1W#$(}=buAi^3$$pu88(d$7@`*S(20m?Y@yh>1+k1i)a8yEVLfDWgF%^LGtxmf z%yiRdMleclm-a}1+-5LxQq*I5&5S-NI+dsP=f!LKXdaiD=?p7peR0c`E1%t_$9XGX zxJ{p3LQL9CDLrG}EK2jC6yB&NNXexQqFv=nmm-}) zX-53qz&HdfkBqepP0W)kV;7(aO7&i@M)7fLUWt*5QOuEVMg3k=zEbL!@(p>HD|LEg zfaPumBCG}i6G*6?yPD~mXwa)vE_M*7BOWwTI#4opb}0eXJj?{ed#Q&3&Nkf9fOIu3 z!$PNGmV!}x0oIYn0i;GNZoWb)goQhg?=j%Z=i*? 
zy7aK%%gl|r!525SHfk&6v^Hv}R|PhP$*a)R+Q>BBowYWu1nh#5AQ;)aH#;5;^JZmG za3z3?#Oas9na|Xt23m1TpJ-D=E+~U4uZ=35(bBCw>oel=CfC}xBco?lEJeBk-oue_ zEEV+tUa!DUOeHtwalq%cYQ#;CjEtL-V}3R4yohc2duNF%4Ti-K*B0cO+HnjG1!YSo zv+0x6zakpX6w`2QJ*7V(~m&>Wie=gJX}PAzjCCOkbH%G@@lk(QTPBimB2t zDZyoFkx1E(Iv=600JoCx+)SLt;-pvwD9DiZIZ` zBAgMf+1mc&ZSHt1WQN4Q=qe10+l+>!vWV%yY$BOrTAadZ@ej{r6yd;mE8mKH5pxv3 zB3HRZF%<5ytCU8{yQ(I73fh-Yfs9ub)JMz_z4>ju`5nCl&7d=aA%-wmR&T+i-jXN_ z5-fu~70Q&xERS?EsVHfIK(bU-1XL|juT}_^(R^{I1_Pq{A|F&=g3`idd!0DyV-&0K zMXU-=QWWGz!KNF)W=Qxk%uHSwJ3ktzCJHTaD14aV1QX7LW%3@HZCk!lOCuT=0+Uy@{qVx{v0{%q1HinR6xo%5RY?(f(M zid*iNOe7WGLR5<@P)R}|ort=*hYr|HkvgRvd=>7NXbN?6NjiNDnCYRYVTf^tyP*dO z4ZL`pYH0!|Wq%=VQx2vhSs}&h%3^V_m|>~88lpGx{Cp2^Bbgyo;YxlVx=j*XbpwG? zJU>vwQTXEwxGHcVvC|2#HUNB92|gs(s#6s6T6!T`5l>iTtq96JT&!*@2_&%V;Ko(j zkVq8?%_==4yXc@xnnBVd@bTc~Mb&mjMN6s|=JPpps_LV)vSMST#WZ_=FdnHXIzX*1 zDJIe|6l5^LNjOqF)Lb`;8A-mE@pHy9+ZjEQUo!8+sN!Lb0imTjcMy5WBVw)w=%2AI zF&JCRh5pfx8xRq8sE`jPiL6ffE>jZ?SbBR+>IUvIDgZ*qV6ZBHrLNOyiAyOf%3Co~ zDP36=9f;HsQ*m*I=wZ4;w|R7Lv6u^k77I&;gwvEvPm45``W2ES^9-XXY6yr^OS;8? z08!miuq7%$5O>P30SrkyQ|vP!7;jfnf+kYsv`!!n2>5}FypY5=SA;PnIYWdep`|fL zj~?~qmF7z_cs{OnGTjK7uI=+&#A_NIGE@5TVnhRBdMQb3G!>-=Ipd?#Dg=bh6fquR ztE6J80yEWq?G_7UMRA{k=PXIhbu-i_g%nO4lAZVmRBEhmiJ{j31(>OVAQ_3H03g{7 zx*1b96DrirrZ+{WO*D$;V~dfKnAv&yr!cC2d9O>|opju_n6;WccGZH7I z86MLW^Yu*;fAp6f)65`!@I@0H3?)8`Ir;c17^R4#R$cJCvDbFt2bXP?!H0mZ?(@wsO zRK>-GLhFNi74am^$%lcnM{;;3j4I8`RS<()K71?6g{ozHvBh)f4EE9UbN+x_X*iXDtG?G)!OANr9Hren37v$H<0ljK*I zOxI5GN&h0zR0d8y6789MN-~koP3cmQ^c&zZ(d;3mS+Rm1HCJ`-1OriGcZWhPAPWQ=&`cnYF36y8zdr+Y6^`>%rT4{@>OyS zs|glmW-#Dlii|?G(I6JUnmuZ!Lb}K_!zH^M2@e4=_Ogv7 z`DX+yy(rP-ur!&OZBKk5FC?Lbrf*!XMGagS3ysT#yz0T1%9H1oHlFOawGokN6HUZD z807D`CI&N&sIl`7g#I}N2`+9^8L%#_k^m9t@HCvy><5F#dqC44e`N$U;D8Nr;`EK^$4L?CIdSvcZ)LTa&;v~O0EadGIwz* zfuJ)azc83BF*qnHGnQ&rmG_xA4+G-q*l98>mKprZX|l&^F4N>%#AwM=mBUi=E=#jk zN65JnnVcD^8# z@;BV6>z-ml7-bBMd4R1{6ZJ}%qE0-PAr8zlk*okuhri+D#2D%%-)k**nY2sW4bsz;o?96-OG)Ia4;dOSZ<-x15Tw5p0T$q?)OeY#SA&i#szV(yECiDUcN^g3~7L=C%O%B&(B# z5V`myfi9c~blgnRN`V+wI8ui%aMI{WjgX3fDN+Wv$AT1tzhLHWh9@e=Oi8LhqCpt$ zMIxtVP`hYU>%|zDDHPX}TT*&vkzGvWOe!m5XtZvpgH)}TJ6Im$3X~Tm0s+l$CcHXAX?HpJVtJc{iw^tn~1D|YzS*>$mu35FS zu_$@oaz(Pm!*gwCeNn7N=@*9mMK!a`Io&>|N`%qK#-~OUuS#e3lSawXgR(8vS7mPu zWU&}54NMwWBnE-Oi0u>eH2s3tQY+)KR4mbhMW=~PqzQeC_R?_OaC904`N=s5J7z}j zELkFwH>#fwtknJZd6_b%n=V;0Nf+}>vt4-wTbJI;G=0B=71JM=7%3K+gogtPDOiGm zxDgk8k2lhgk`=Fr6cH>my~0n(xDXcNYS1NVQY@KNN+n*E=E`c7BxP~S zFUxCWQQ}=#`e5ElcZ2eg^LA2eNLtO!MT(o3J{XZKilwizP$krla|uO4G*O6w)}#ZX zMx`g|G~JLF3Xog*qe_vhxq%nb6*DGp^AlKR4{egM@LeW#w4mcU$Leq1*(V<^(3m4+Tf@q zchc~niHkjQsTd~D8PIfXO}JLS|8tNph`6T}$kcDXyb+nj?*odgKN*6K3KjJdKcwW=I*B1adJOX}`MF zZoj|CIwBO17%ww9HKu|K)ic#-XkZ2f+AOB1da)vVs4-Qy(hDWixLjH;>=dVIb1rQT z!U1EU_&ldp3A}a_i6#!uX{cdv3|#9ccVK&d+vUidTXzh1wMnUrz6dz=!kC)tIi)6xlU@t6 zt6?&tR9*2TP?0vz%4FI?xhujFvw1J%mHol;7v@_SSFTz{28@bjaeR*8O{1~sEGL(| znp8`@uF{3*1!)Q8%lb0GL7L|(T|%KzRA$1}6`L-aXX*JUS~#$T`gjXZr<_&qES5N( z_d=ElXdvV5(4{Jph{mmWJE_LhS$$zZox+r34mDI-EQ;zQ}r;>5DG1XD__ z&f8c(Txd`(lgTRFPT31#G{pJ4Y&I9Knk$vqMk+%Dy`VT@_7()BRkWg%hcgO6J$OMw zEI*1bGOYRXCe_H1f3!KR7nS>|SwWjDRi(I+DjSTnA8oNEAs)lYs!s(*@%sv2-0>0FABWWwebXiy7(#g3sr)Icd$Ww%l_E#_4-aB?N_#Hp2B zB|YIPwdunQu#PEdlbC_GAGM49;OyBI!Zci9?>EC(v+%vLy+sXJYT}I#yL0;OQvH^A zSf=l2NcEnsNS_lO!a37(PL}pOAPbh*YGRoDWrhi`$tpI=F+>sgw6cXs4)M3$RAK3F zROT^*A<1x)%Ag`DAW_0G5P@`!xY}tU%L$qq3+JO~sS@3V&c;=VH&F9h zi3m`viTP|cHiti|IGMHQ+iJ?LQwfm|asVB3W*`A#U}(J1(=z3g37^_9WS}2KsJf^N z-^4hGFT>rHSw9nR90kqdR_)Y_c{i(2x^mWhDR7oj_@h>#LeCoT2+Wro$(c+k4j!YK zghy5+sC=!kq;}G~u%XIDt+ncF%QewT#g$_zIxWK1;?`t#V)5_H8xJQYRXA>SZbOtyHJp 
zjfr~Qe_d}Kl~b_#SskBUrXO4@0cB!l{wJ*cZna)oy5Hp5}h&kUsvf|A@w zz#-AfvfPEDc_-6$i2_`fvxo+J@E)82J#s~5Y_S(fnAFYGF6@Q8_fU{0i@0X@ktC;* zBJHU*N?S`n63bfgi4^UUZc?-urqeNuL={UGWNKN7(p(ajfrg+pny0LS#Rh2tIY|f)%!Ux@4{4V;Vi;@31_f|{%tm-Z#gZXN1KEXV zn>f)Lr3H#a6&AZS`Ji_r?4maAV@ZZu-; zuaDaCEw-OwMU_p4Oph5-@&x4w&=vh?C?kp1RwyvyMlc62pH-ska5RrKaC9jwNdSKm z!c6?CsE`!JmLmv0MoU71@MKOY4mlPQqYGKP+)gP+s!A!l+-gq|;ms)`F!67TSZTW` zzYuw(;q-5RHEBOh#(mK&Qi`(8LLD}gV{Cg54x zwG?cvw0A)zTTJOT(qri?aylhSHRAXFbD=?{3K>a;jf4!1V;O(4t(D}^HQCC>LyMu1 zqXu&=AMCV8Mwj&%HrdI1H^eLKf^66J?f;~4eSC+{HEu3#}Q=kQ=D+>h3wq*bjw7flmg3QjD%m||)D_;h- zjmdXFd3XZSGMt9}fziefn=Xf)r2*Wea-A@*>MKD&2pupnP<#F8zvOuI7QaM=db}{7 z_F#!Yi~GfrHxNt0bOdBe9uu(q%)Lz(dL-Xsdfb|Z1W=U%KQJhgIWE1nrsow!y;P+7 z?v*ZjnMNs(h>^?I1RNuG_7(>t<&m)R3dZ}Gvu7hKDWk>*DT2e z@mi&A2XeCYa-I5`Wv^uOaZ{}8{8f|0i9x-nCP{Og?!iJO`{lR$#Tn_c8h2KJNkehN zHn$?lrF5EmY=SFf?NoMIih3cdqTaHt^7J&}F8{Av=1C|XzwapCV>eKyoA~UIsZ5^c ziH{QMCsmqQX2fKnnEnfM}>G=*w)S=Eb5+okwBn+;XN45-}AbT4TXFYG2tI0a_! z2~_z-QM9;7{DIU%s1#Ju9s#`T$*0H;Yy7{=J6K89aMjp{ouiV{sRSx%X^!QkVsR{3 zxzRJp6f2zeKeQm`$Am-NJh9H6*^3q18dbWq$Cgx;C94$ObS%zIkvPC(9XI_g^^)X` z7zHa_YM|71@$#lDl`?-yBxuy^RAb118LE;>jD~Mz^ME~IET%iI_^-`cxGt1xk17)m zHL5a!w=IjJ@}gV<)l|{4B%zK)l9nYze!klC>4dy8mSu=GXd24_e{EStdCQVCUfLk9 z3Cq&TSe6UtS;n>uiM~$F|F~sAX{$UKA8xx)s1vKiK^ZocZIrOKqr#IKCyDq(*8c!3 zzGVfr5?~Rd9bg3=U~yHRy;G3=Ts)&BOCqufSjh3vDz9YRYN8H>yR3RNxin^dlgY%Y zwXwPxJ{$9&Z_pfAN2UXke&t4WOi%b}K7E~;8)6Q9ciG#QRadvKX+{vliTw4=ORIA; zu8kR{Z1#4O7SkYd%0x<)O&h=^OvajKGW%iT5p@wx>4Jn){EJ3Z6h=Yi@q#d#C&6}g zLl(HzdYVqDv=_IiV`D zL2=Tw!D3&$Jg5YZ+C>%DMQAs<`!&secF4J1wb)WS@no(`6qDNU(CRJ%soK0JDT9PQ zhs3e#PP8{iT`DnD_J&I{r2a%}BbBt4DLR|}47;t;12^iaMfl>Z3T;kP?Ct1fStO@9 zPUq1vwTWgW;|#$ivAv8>Ds!aiDO55I>$Iv^lGDNmG!&C~tO4t(R)&nbjG#pexSi)8 z#4dHo_An#cG%-_ISV*LPj+v6fw>H)`6H8j%j3hw}E{B4Zttqn;6O%kX6uMcr`7)b< z5vPoaV)-)p6;6ZOo&9?WwuJ~0)zx(@{o4D-?MwzDos zH(ERFEh`Rv$T=rw*<>&GeWo_)kaZWC2dUJ2`HSZ zDm2n9xKlNu@Uo4pa`*ePo0Rbi%_U@!XFOUE2PhsZ6r@b@$Os$5oVR}7Zt_AN8uTB>5(ca+8HooF%?v+s4m&f8xw~wZIlS2S#7wF zCryYdqWQo6XBw$t8F-#6_W1+BP?*6^P<_SR9Zr!GrtHJiEJ0XUuIRjo40)~I2GW33 zLC(tDqw{jk?kRhgP4HUh6>`>9=Gs8Tv$7BumYHMcTe)_H-@aCzuT2gxJQqIshMeWG4pUMlxPg@wC zgXBV7lURw&+EmgF;cHv3q+j9>q7usS-7c|O+k_ooB&<8dX2!WeQUo<%+yfT$QYvg$ zB66!@T_7EmCa&6?2m&lGL|u-bp+cDCGFR;ZJ}Oiks(6*!iR;v8Iu}7nmM^Mog(bRk z69&QTaY3u8JAI$hI6A<$)%ty-a$T`6BZKa=Q; zp9_U+p@EXxi8e_9opJJsU*!VH#bjR#Z&OrO#j1dYF{}FMSz&jcfT14pF?p8?wJ)wP z%J*C){@$h#$dbj@bcZQc>;9T(jI`8r7pWdaW!+k`P5Qz@fsBu>c2d}A3N5AEWSG2R zbF`F8`BI@%1s$1ZOjPP(mpj?3Y68vYVHW8{;#NP`v(_d+lw?L-Mafbjt+rDyp7gPF zs~)OODHxZ_<$PhLusJxN5rvhJ{eg@KYbY|XLM+l8)if%S!3o$QwnAOf1M0y$*vfX+ zr(M%g0kiHpt zF_p!8qnMWcRBAf8dm1Y!K3Qs!hctZ3i^<^AI0aGq3J&?}_^?7Y=Fm%gDmE@!=ZRT) zu8(adraPYi>#~z!Su7ws7wgtAN|xb@#4bfKGTkmaQt4`>)B*&|y%@OPWHmvSjngFX z>niZWT05f%tE+~CGio?woMbqdmL`WoXNDS*4kd=A)U?&Z{kQ<>9~`@~GDTqUq^vHF z(dnUj?C-Gdcwb9KcuhX3by%_I5Ys2+_r&9=AdeHr1(FM2U01Iy{*Rxzs76utBuX=@ zv6VRLY%k7ipsdWuL2DUwO+5%`)x*EaJKb6)u2$ZnROg)w|5cA1MEnWQQ*&cD*VLM6 z!5GtTc6I9Lm=&8aBs?pM@laGwV3y<>+~UuZ378&Lp^!34M(PKqaQJVd`4sc;?#8mdh+(h zJO_^gGDuFcP`i#|GgCBGow3UR<{H9d`jVwGg~tSCu}B^~(yjFVtYz`czv+rqjk6hV z6qdM$!ub-8uTCoY-! 
zs4Tmz9nAgdxlxTP(u?riX;w;+m6nHrVCJ&-kL_Xgc%m`RuZOTEr#^@mw#N?+PS2A~ z=4!vOz5lSRLWprxURDQTwx{L`MIc402vffQc!;p{U*hkv{nwnnO8=83VLr4g*?0UH z?54de0gionl3AfR=_Ff`q?0aDw@ZGa1i$OIWEZH55XReG&T@w0S@E;7dsc0Ily&Fr zt~h_ISO=)?t@+taf}5CVApfr9-%Ej4V=wA+p4-bYyx&M`5SVXR5FLJph z-%F24a6ICRx!Nw$2jfJKn9zsVH6XU&l1Jh7Fq0L>V8rC9o)ze|gK?KcWx)o2G%91C^e0%f11GXJsoUNpE9em|3w&UyXf7S#SA=L3 zNe|mvldW?lT9>$9{5NzoX3H}|tRAtlDB)Mt9WJL%X6PPOmS{`2J1Yej#p+T)q$pNT zmc_kh)!g1GGM*@uJsaHNSJ#rph-#vyG|bd#SlRJ#nqcyPAD)NK2@PoC1X4)2crr+B zhR7v0i6<`A2kS4iQGS|$%XNAoD+2P|gM4t?A0U_6RxGrS{aHP!5ib4APo?l{W{J-! zR#n2gg-JcJJ@u1Q<0-AWffXNP&QuNtYEEB*;k#aia z2h~X?Fag|qh#c5SDqMnjR(H&rNxey`!J~sJ$l)-Z+{ey)w9>qXR78}yS#wFYdMpNoe)crco2x&zj6^6i*430P~OCHj!(oLKa>KCnw z_ShJL`e8EgS17lgkz@XUr%vUa<+&FekV+U5 zE>9RyJT%@@zQyN>*QrE>NuQ^N6?~WLrE;ZYkJbAEDV^*Q->lmio*HT3VA~C6lOBkR zlOtf^KAQb1ps+8mR|on8|54bzWElJL)QcJHgC!Wm3IT+$`8|llaV-E4LTd%a4C3 zwt_EYPo5>ei^5C^Aq>)>5Vp-sK?c^Io5=o4rx7xZtd2qRQw?@Am|Y16#)xKcIzwrO z69R98I0Q6g3P;&6&nXJ@tU^&p1`|Tw_94JnyajAxf=L}F{3M`Ac$$l2Nk znW{d@232QWiT!+P4z7ktesl*-z)y&w@e-%r*#s#|?XtIn8zb9ZbAHNAOcrWUp8J=^ z%MW%)1+ru>;SDvWLw@1AnrbyS*ODKH za(*jRahRqoiwL;~32GYHkr<&&7n#h+WKbS|+1MEEz^w4j^?OEY z7R@OMv4*N3S-h*T6j1q-)z@ThAl7kKP9GU2wjT)KtG^5ln@@344d z)nHm~mHf&4MW`+DiRu&YY82(?OE$s$69B{Ou#qj)U9=$Xb#qMofqg^iXQz-zQJHI~ zPYSM4ZVlf^YYeUw|6o{Z0znRE)Q&>j zh-M7+kBWSGsUFQR44LDho?M|4cpQ&MbF9q`Bm<@l|3#5j1vb57L|R3aBR3@{ly3`S z$#nzNZfn@;DD$1rh6eJ(Q#3l;*rWdLQh&Fpznj$Gwd(Im^>>N-J6kRvE`J8fpC0n3 zqx>=DPp14qk^K2v1r%_FC|j0ecH5i&xH&90hru=41J()cv3JSQ9^at708By$4Ty$jZXjBhzhu(nscwcRk45r zviFz*<>}B0{lia|Z?jh6Hdke3`zoFkCCkqTRVdw6v2=HZ(mfRml9u}`+*Kf@*G7F6 zO4lk}hi#Ydvb(ajk}E8a(SP4E*Hs`FN?&R3zjN{8W?Hb`e0;HoQ>eoSZl6-xJ3 z)|raKr&N`VC$3PsuVULP4zs_q(koF_C05n;*pd0$aj#TWm0DHvV{P^rRaL}Jr8Zls zRkg%sDyZrh6jxMLr8ZlMsw%OnF2~iX`W#oQA_=Ids^e)@1CFaz4LYt?HRRv5Djp1} zI5PjBuNr+^t!n(gYgN;at5r=su2vdX#MewgBy;oZLK@Lwspt9Y+H{V zU)$Pvd~NHYf7!OS9$(wqetd0f^S@$SKQOz6UQl~(vW=bpynX%13|K0Fi;(%wyZ^k2 z9b~F2H3+Rj4ZDwnl{JwHnAiU|yoq8W_Lj8`bvGf@nUa=Kd2%LKR!d25UZ!LJRo!k= z)xEE*;n;O2CoW~`PR{en)}0KD|F?C^vn?ope_6=cbtmV>W$I2&`pVXwOexCO{dc4X z|LsrcX&tr*p7wz3nS*%b36e-_D90Rv84Gf7t}N@g)$*Zi(~kxqA`M7H$WHk+Ud6cc zs`iN%2!Cn_+sDpw;i6c|U@;@orp!I;5m5_RJhm&AaqKPfpQy-6sl4zIh13}|0}%df zsSpNNZ9$VYg#)rfEOe-m9!j*|quTGG{T^!%%SNeITUmFL_R<3R;RShGhW|^)kWR_g zQy%G*PeR(s(_p+fzx`-8-`*h4l3FB}eT6AP>%qdazbq!ixU6x|NbZmyS5Eek8d6wE zgepr(*zEx|o{AD{sfekRBvVlOjpF?Zs3k!rm44MxZj!GO=|$;RTNO)IP=EQo zf8iJTOQ7QKTJ@KNi1+Y|{Drs^evv;z7?&KI4Er&ce|JlJ=1-PyUyf z3cX7@7N^BYZ|JnbNZL|q$-xTeq?dR+)+__D%ZHggYzr zqoRsJ6^ag2&~$n3)d4O=XR|(4p=f@w!BI)&DhgFnbR{aP#ERM;JJc2JzM?HxrlQKM zsQIx&T~S394YabMuFQ%${=My2w4%yTRf&o!+BM_R=lEGsm*ZzeY>4{ztZ2aTv!X%A z&x!{An^rXX_*v1!<7Y+V|4l2Je*COx>hZIp+5e^$LE{~_kOl*J++tdZ?B4y?f*R~# zDVORbF#l9kgBm<0gZZbz8q4f|o74SsaSht;xY-hn<8iYk_`rYBmS6*qn=L^F9yeQJ z`^LX$OVD)3&6Z#okDD!RKAyJZ>>E0;H_85yO~U^DL-xcLq(sre`%yN$-#=te%t^&t zC(0MrkMei@qc-If4(s-Jb8*GCNCRN}Y%_E2#W0RlvP}MyEf>uMWQtv(JO5R!JZvnr z!u8n>Cs8Y8Px%7bgBGe-t2`0#Kh+A!Ahp8s*=C=pHB?Ehfr_;%4&>Or{vA~Uzn5gC zV7l0UXp=gMv^$o0 z5jafQA$f>b`L&S)ysG~*H=DP7$pxYLh8}Pp$Ye{XY&JY3J4V@`bwFpgB4l5LpJtGa zq4JaL{8Bo!B3nUOw2So5VsE(oz(1Sq`Cl8QTk`Y^8^ECvWy7LuuSY?OwSp&4vR3RM zgfz6E5!I-r_08fF7xEaUm1h6o3%^rJo&1~*4+pCs&yicyLq=@4WD~1xCMLfysmu04 zyPfi&IG6DwNNUfWv@4fwuR>n3qgl4la;@sP$#X?$MfgnU%Fx!K0e%?9A3DnBLi?8` z{DD9)7z%|sQaDmM(l{c{k?tHB&QZlVGAkTaD;(7-9MvlvHU8$P`8P+czd5r0^HE!~ zt_X&Xx zoIhL(&Umi=t)o7xa@E;yws^4M$I!CguQa|p_nWNiFZiItle2#JE$;r}3G2swS$)-c z@3eXJqF>T3@Bdn}`=)$f@1|iNpZ-koQO#PL8gdyfAHS278oF1PgMG1vi^I*~;n18z zzLB$HEn+DN+@z#vn$9%_;zHbr+~h+72w!XINKGV)oPt=$2xK0z1<`fhBRYlX6rxjz z?nV4a7>OV?5EE&RJIW6LZl68h_s^$QXR=c>LK-!#z+f< 
z-R#;K$l1s!qySlf+=@Jhe1;foEX>C8v+ZM!bIi4mV&{R5vE#;1h%=g#CNqYN9^)nhCL<*snl+2Q%$Y+sii_z1%BStp za9T~1rMc2lX{EGKwN6^58l@Vf+M-&qLLFDA<4Sd0WgjcmakYJ1;~cA;W3_W!>m1i5 zj_VW0n#6HK^0=||xG8z89muaORu5H)q-SV;U%(smgxq0Qijk_PY1YrxL%;RIOB~~B z{tEVZ4h8*GUIqSm5#<6hc$K4kSvqb@$9)*M4ukt#T7qzaOWR7I*G)sY%VO{5l*h15psAa#*?h>1jz7*Zc;fHXuJ zAtxZ&$ce~FNMoc4(iAxvX@)dMPC-sZS|BZvR!D244bm2AhqOmJARUoT$Z5#wNDg8l zoslz;E=X6T8`2%=f%HUrA-$14NMEEM(jOUsoQa%;oQ(`b&Oy#a&O-(v=OcrW3y>km zg~(837&06gfs8~(A)}Eo$XH|?G9HG3U+={G2ZbNQI)+2WycOrKocOx5+dyspP z`;d*u{m28zgUCb3!^k7ZqsU{(E4%v!4kGz1qh-^b% zLS9B*LAE2WB0G@Rke$fu$Q#I;$S&k9_fgnzDB-5_9NdS-yz>42atov56F+mA>=3IXXF>;F!C$%8}d7H1o;E` z6FFLKl;a2jar}n%l4MHeSR?S-BYT%U#$OJjcYvD`7%mA{pN%RrUe!> zsg*T(@Y=IFy#3&bpEX-}_UqcW=8^j^h`;i5yIy?;y|(lFyM8~TZ;Ok&{%RBt=y=l| zXWp7~`IR@c{b|!9??)p?e|+QBaFzYTR&}Yqb^Y8$W0@Zf z4SlfZ)b_!i-UdUSeRIReE3V!<l*wQqmug?%3u z*6aP$vZYf`dw6UQ*UzaVTAla!qAhi9{N(WJ^a<}>w(|PAi@(}&PWH#kZ@whunJ+qz zx2|oPwsGK-)le{cK#leQh|(j7iR2(9NEXr>QOAJwht52Ej5YVUb=7NKGxUV^ujJm{ z;b7Mr>fJr)w+&ahKI?tXZ_Ul0Cl%duMbY*p4f_~B9eL;T{_nhU((@%?w;iMY!cGQZlljcdQQ^vZ{3tUvIB@5h@zozU;qL+_7y>uHa--pH1(Fa7iOtv8%~ za?7>n{CLZ$pDguE`{w6$?e;GS-Sz$O9PdqeL(hy%e(Kky(bGD4KfLB%&uJfad+Lk6 z4_`X)3eSaG^QP?bx{p>JFucLZ!N4pOJIPMK``T?y1iEHZECqiLdPiHNX7e zyKOVN2WQ{>Ra)oSFV$UmSO#RlqLM(5ZyTSwVQPBae))qZ{(dy|k8|5zoL2X(dpn)b zu4(Jh@1+H!w+|Wr`oIgWZ+~*@2KSE(hSR=0@2WSh`R<_`ued&||MQzmZZUU{n9%#8 z!PRPI)DAvz)u*4Iw083udtx2yRL{x?r36y%d2!hfkKge~(`|Rw&Ahng!Snj~rrr1C zhu41h$ME7?Z)&9fvgxP%A8JjB@1&bge6e@48|(bo{DIy3&svnS^Rq1%O|14{ z<7*$EdgpIFR;I^Bf7N~K@*{WDI{m=fcbm*P_xlwsVlVdVxp>R#ZQuLzmY0{e4((d~ z+Pt4`&;DrUZH+E_^!;_u+;jTg8`?KXxpMk|>$eY!kL)&g;ImaO{dwPG@2(j7OS_YQ zy6?I9PnTSH)9kBuPPnt_TT>p)X!u%(Yd36LKc_}y@~AcYW-OJqUH=7RCSU!<+%Jya<$rBc^oxB}zkl!gFQ3@adP=Wdft$X5d-lc??|Wd~DW}vw z{iNnAZ+l_$*jrk)+0?J__Lt`$*|>AXo~rMzZ_#VD``|C{@3&G1yjtAs=PRmT`gY%< zz8k+kV_@IAwtwutX76VYzkXxZl6arT>NVL?eCC*L?yra5aAB+EdO_jh-%hqdzYnf6 z?!mL(_~6PG=SIG&(*5m8%SXI(-OTXKZ7$0^eEuaJt9ldn>$CaK)u|S4`01u)Jv(MwSEn4!)9+~a(Su?Cn!OLbyJE|^qem=!W!BV3PJOFz z$iyC<&wFg=s8KK6HhcErg%@4)-LeRtn4ii-AM_wvh^cbz}K-VHCkbV<&fIlnFb?YHl% zo`3!W-*oSO*-Hlw{2sdJ9&7)uT@Q}C<(6T8+;?Bsuh*=p|IE5|XaAC)zxSHWo3Fp% z(MP8}`^O)j)V}JfPWLuy)F2$Z-AAa%0tIw)mzuTDSp1WaajT+6fueqkp%^puM z?YirlJ?!(P1~M`lv@R*hy8hEoKkoF+H?KB2;e>t@9(bVm)oIhdz9|sM$Xc+V&W%l) z3@WTutL4dAS*@E+p1k{k!GjfDNdis@D zF75X8)8pQ2*Dk+zuU;!Z>eFZSu0ew~?tAUEi+krSU3c~W{P*9#t#-y4 zi=OY>_qumlv>1EN#TTcq?AmqdYrp>bRm3pd-s0jv=M5OJ_CUvu)4SYsQ^P0jxTE)H zXP$Y>kGI}>;h#A-+1GhNw2;-uTMB!BeP1CMy>Ymf3DfEVNZXuYE`RSx^!8x zy?XVN+il%?)p_gJpYy}qxku)A>NKr&Rwyh!`(}kgcfw`w*2MqzI*Y+#~#alGcz;0<3}H@pD}dk zrcXnmYIQ#N;GUvAdp_uJ>Zzm8ZQs7IUoeQxBFLq zf276}PfU4h`t$=!8aC`T<>Zrx+Jc>I%3-fMgK z@HbUfuWtNcdU~wIgb8mv`QCeXPQC0h-z6(owpf4t^-UkCTeo9u@#3mie)ZK$=8heU zd!BR7T}QLC2TcF?-Z zS=RjLuD$lejZK>lz9=oNZu-WJvtAiE@Qx!-J~`>_YSo&wC;Sf}{GUzue~$2f4dMS4 z!v8_S|J{WDs|f$+5dMEA{9i%%Z%Ft*Lim4&@Lxjsznt*jlkh)|@V}by{|VvWApB<# z{%uw{#O$I2NC{j68;wx{vRd$*C+gcO87sI@P88F z-$(d&5&o|u{NG0S|AFxT0O9`$!v7G$e`CV`m4yETg#Vif|F07M-y;0iBm6HV{NF(M zUrYEumGD1}@ZXN`e;483OZY#N@c%2}zZ2pAUc!Gj!vDjBe-Gh*3gQ1K;lBale-+_> zJK_H#!v9l*|0RU~wuJu=2>;y)|6dXQUlJce_+LQy??L$AK=|)R`2U^oe=gy_F5&+K z!vAQ(f0Xe5I^n-P;s1WZe;VQc4Z{D8g#Z48|62(E6A1s+2>(IC|L27NGYJ262>&6% z|2>5N#|i)22>%xo{`(OApCtVMLHNIk@V|-h|2*NpCE>q_@V}ezKc4VEitzt3;Xgw7 z&msJqg#S5&|HFj;bi)56!hdhV|Br-tj{!&m;VwM)+?+_#a02&nNtc z3IEFo{}$mtkMREy;eQR`e+A)x1mS-w;lGgZ-x>I?2mWJ#|2p756!_-?|E|D)5b$>a z|KEWBB;a2J{QCg^cYyyZz`qOdzZLio2L4&VKNa|I2L984{{i5?3i$sD{D%SmHNbx+ z@IN2;F97~g;6DNQZv+090RI`l|1{vA2mJd3|9gP{2H@Ww`1b<-rvU#Jz`q#yUj+Q` z1^#P+e`DZ39{B$O{3inc!@&Pd;BNr`%Ypyhz&{)KKMVY40e?U6Ukv 
zp921u0{;tv|1RKP75M)S{J#SJX8`{jfq!Sf>g|BHct4EU!5{|3On6YxJ3_}>itn*smk!2c}Z{~7SF2K*lf{yl*IXyCsb_@55^ zn*je6z`q~xe;xSW0{lb3e;)Av2>4$H{GS2-Hvs=Df&X^k-wpUz0sfBx|6hRreZap2 z_+JJ5n*#p~;D0Ufp9B0y0skj~|8(GA1Nc7*{I3K4lY##i!2dPizYq9d5BysL{{Zlx z4g4Pf{`GX+rWP#@b3%!-N650;J*a; z*8~1%0{^dpe=Fc$2>dPJUkCWV0sPMe{@sE92;d(E{+YnPBk(@~_`d}FKL`Hz1OJbK ze*y5{0sPwl|5t(kUBG_|@c#h#{|NlI0{@o4KL_|f2mBiX|DM4AYT&N}{|ABpUf{n4 z_%8(hj{yH6!2dkpe+c+nz`r{1KLPmP4g7Ba{=WhL&wzh(;9ms%w*!9z_9!2fyR-v;<+1OI1$zYF+R1OBf8{~5sF2mB`h|M!8v2l%%H z{)e;)9k4E&?O|3lz^8u0%D_zwjBTYXidBFcW;D0^v-wgbB1OJPFe+KZs3iz)D{xRU61^ffRe;M$91o+nl z{s)2oG~jHh|0v+U3i!VO{CffaCxQR3 z!2d+xedqy{~Ex54e;*( z{2KxP^MU_$z`sB6zZ&=-1^%0We^ucBCGejD{BHvO8-f2i;D0*szYX}`0{r^{|M|dw z1@K=F{N2ERKk$DQ_+J71`vU*(f&X2={~F-`I`EGJ|0ckH4DcTc{Fej&#lSxV{KoY0{@P{KLz;T0sO{ z1i+;b04WdvPeB0O4*@U`0$>0HfC~cPP6&WG5CAnH07gOpd;tM)9RxsE2!Il0^lVGfDi=0eh7eZ5CDHb0DKJr@C*dNFAxCNKmc3-0q`sYKy3(sdm#Yg z5CDTA04xXq0|KBw1i%mofI|=fEg=B>5CB&}02Dv~oDBhRIs`x%0^lqNfH4pNLm>dN zApmZM07!!Xco+g800Gb%0^oWGfKCtqjUWIfKmfc70dNxpKo$hRjSv8Z5CA7b05pXF zcmM+6I|zW^Apm}c0O$(=FaiQ#2L!+@2!PWd0P-LJRzU##1OYG}0zk0a1Oi|<1VB9q zfNc-}-$DR94FS*%0^mIefZh-QA3*@@f&kbD0niHqAQuAQBnW`dApojD06Y%?@D2pP zIS>FVApl;30Ej>Ucp(7hK>!?p0O$e%@B{?FXAl5CLIC^;0k9bYU?~K^JrDq!AOLzm z0JtFlK866;0s$}$0$>ybKywIyS`YxK5CD@P0Qx`xWI_P6f&gd+0q_X~z%38}+aUnj zK>(Zw0q_F^z69S+k1i%akfKMR+>OcS#K>&1s05}%{pdSQ4T?l{=AOM08023hq_Cf$$ z2mvq|0^oZHfZHJe?tlPT2LaFq0$>>gz^M=bXFvd)4*~Ef1i*3#fYA^DZ$JR-fdF_2 z0^m0Y03QTELkNKTAOJ3d0Qd?5paulMTMz(CAONO70OUgeG=Km&2m$aS1i+aP0QDgN zW03j*L>2mlWRfDQqW0ReC;1i&x| zfL|d1c0vGL0Rb=w0^kG)fX5*K+Cl(SfdF_A0-yy1z>^RFQy~B@fdE(!0q_U}Knw!l zN(cZG0-z@Zz)=W*=@0-jApkCi05}W*Pyzu^9RlEe2!K-{0A7FqcnkvIIS7D_5C9iJ z0Hi|zyaEAm1Onh~2!Qqw0MLJYE6{&WL;tmd{_6$( z*9ZD<5cJ<`(0@Cj|GtO*y9@g7cj&(}p#SN>UorIG0O-Gt z(0?~U|J?!ocP8}TtprH=zGsh5ide|5bth+YkLW4Ek>s^j{a~zv|F`TcQ8fL;uZ%{_6z&HyZlyLg>Gm z(0>Kcf5V~w=0X4UhyL3Q{TGA&I|=%41@zw%=)Z2zf8Rm>JqG=k3H|pG^xshEzYz4_ z2he|ep#M&V{%a5Y7li)n3H|4V{%ZjJHw60cS?Iqvq5n2O|BZzHy8`;}YUsbc(0?e{Vtm^@IMq6Z&r=^xwJAe}|y|zJ~rA1N}D>`tM%oKR@)} z6VQLtq5m2}|D6o|R~!28cIdx{p#NTg{@Vxr_aXFOA@pB8=)d03e@{XGErb4B3jH?~ z`tLO8zlWj!#zOz~fc|qq|NRX8mkRwi0{X8N^xt{Ve~&}|ErR~r0{vG9`tL^QzfYk5 z4nzN~hW<;3{+j^(_a5}$Wzc^sq5rOj{;Lc9w;1~GE9k!+(0}JZ|7An}eGL7#9QyBO z=)X&#|5Bj;o`L@R0{X8r^xt^sKMVTrTIj!~(0^&re;c9y215Tm3H?_M_>Thqvw{Cb zz<(C-{{i^t0{xX z4){L?{4;_7N5Fq5@DBn14}kw3;D0LcZx8%~z`rN(_X7V0z<&tve-`+^3H&zz|B=A| z3gCY=@ZSskX8`{%f&Vt(-v;{||wGA@HvU{Cflcr-1)5;J+03PX+#` z0sn`A|5)JP1Nggu|Iffb75I+;{;h!jdBFd1;J*m?Zvp;wfd7ra{}bSU82GOS{^`Jf z0`Pwi_+JM6R|5a*fqz}#zZm#`1^jma|8szUHt_!#_%8?kHv|7mfPV_`e+Kw}0sK1y z|M9@z0{+(m|E9n{4ft;a{sV#klfb{)S%LUl&0F8r_L-bpZ=CnzJHOBVtlj+Un;l); z^0G6!Z2jn~R|d{ndFt)Y*6;LPpQ}Tie|xT<|Kyv;X1~|qfuaj;={afH>kab{|2%rp z_|?^RtsAoa!ZD9rIdN)_0e^h^`1?bvb~|tVr8g{G@$TK)@=b3Bf7v_8^Vox3y*r!K zz5M0V(~K?mJ>}A?EX{3yMY`30-0rVe{qWk30}p*saLtCo*q8UTy7<#y+w7}7?c>Sa z57x@7bK>)tOnG!<>RmOS&N_1LX}w=~V}^O%2b=$#{_5aE^*+3##S-6(r`(?rIsb%` z{lkWzS$t>BhkK;V=G#xi_%E%ED f3mS(;PMet9bkdkfQ}Smt_7_YTH@WGU>4pCXO6?<3 literal 0 HcmV?d00001 diff --git a/packages/wasm/lib/utils.c b/packages/wasm/lib/utils.c new file mode 100644 index 00000000..8ba2cd36 --- /dev/null +++ b/packages/wasm/lib/utils.c @@ -0,0 +1,137 @@ +/* + * utils.c - utility functions for libdeflate + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, 
sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "lib_common.h" + +extern unsigned char __heap_base; +static size_t __heap_tail = (size_t) &__heap_base; +static size_t __heap_mark = (size_t) &__heap_base; + +#define memory_size() __builtin_wasm_memory_size(0) + +#define memory_grow(delta) __builtin_wasm_memory_grow(0, delta) + +enum { + _mem_flag_used = 0xbf82583a, + _mem_flag_free = 0xab34d705 +}; + +__attribute__((visibility("default"))) void* __malloc(size_t n) { + n += (8 - (n % 4)) % 4; + // check if size is enough + size_t total = __heap_tail + n + 3 * sizeof(size_t); + size_t size = memory_size() << 16; + if (total > size) { + memory_grow((total >> 16) - (size >> 16) + 1); + } + unsigned int r = __heap_tail; + *((size_t*) r) = n; + r += sizeof(size_t); + *((size_t*) r) =_mem_flag_used; + r += sizeof(size_t); + __heap_tail = r + n; + *((size_t*) __heap_tail) = n; + __heap_tail += sizeof(size_t); + return (void*) r; +} + +__attribute__((visibility("default"))) void __free(void* p) { + size_t n; + // null case + if (!p) return; + size_t r=(size_t)p; + r -= sizeof(size_t); + // already free + if (*((size_t*) r) != _mem_flag_used) { + return; + } + // mark it as free + size_t flag = _mem_flag_free; + *((size_t*) r) = flag; + // calc ptr_tail + r -= sizeof(size_t); + n = *(size_t*) r; // size of current block + size_t ptr_tail = ((size_t) p) + n + sizeof(size_t); + // if not at tail return without moving __heap_tail + if (__heap_tail != ptr_tail) { + return; + } + __heap_tail = r; + while (r > (size_t) &__heap_base) { + r -= sizeof(size_t); + n = *(size_t*) r; // size of previous block + r -= n; + r -= sizeof(size_t); + flag = *((size_t*) r); + if (flag != _mem_flag_free) break; + r -= sizeof(size_t); + n = *(size_t*) r; // size of current block + __heap_tail = r; + } +} + +void * +libdeflate_aligned_malloc(size_t alignment, size_t size) +{ + void *ptr = __malloc(sizeof(void *) + alignment - 1 + size); + + if (ptr) { + void *orig_ptr = ptr; + + ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); + ((void **)ptr)[-1] = orig_ptr; + } + return ptr; +} + +void +libdeflate_aligned_free(void *ptr) +{ + __free((((void **)ptr)[-1])); +} + + +#ifdef LOGGING +char* __debug_log = 0; +char __debug_log_pos = 0; +__attribute__((visibility("default"))) char* __get_debug_log() { + return __debug_log; +} + +void __debug(char* str) { + if (!__debug_log) { + __debug_log = __malloc(1024); + } + + int i = 0; + while (str[i] != '\0') { + __debug_log[__debug_log_pos++] = str[i++]; + } + __debug_log[__debug_log_pos++] = '\n'; + __debug_log[__debug_log_pos] = '\0'; +} +#endif diff --git a/packages/wasm/package.json b/packages/wasm/package.json new file mode 100644 index 00000000..94082a58 --- /dev/null +++ b/packages/wasm/package.json @@ -0,0 +1,29 
@@ +{ + "name": "@mtcute/wasm", + "private": true, + "version": "0.1.0", + "description": "WASM implementation of common algorithms used in Telegram", + "author": "Alina Sireneva ", + "license": "MIT", + "main": "src/index.ts", + "type": "module", + "scripts": { + "test": "mocha \"tests/**/*.spec.ts\"", + "docs": "typedoc", + "build": "pnpm run -w build-package wasm", + "build:wasm": "docker build --output=lib --target=binaries lib" + }, + "browser": { + "./cjs/init.js": "./cjs/init.web.js", + "./esm/init.js": "./esm/init.web.js" + }, + "distOnlyFields": { + "exports": { + ".": { + "import": "./esm/index.js", + "require": "./cjs/index.js" + }, + "./mtcute.wasm": "./mtcute.wasm" + } + } +} diff --git a/packages/wasm/src/index.ts b/packages/wasm/src/index.ts new file mode 100644 index 00000000..21a5080d --- /dev/null +++ b/packages/wasm/src/index.ts @@ -0,0 +1,213 @@ +import { loadWasmBinary } from './init.js' +import { InitInput, MtcuteWasmModule, SyncInitInput } from './types.js' + +export * from './types.js' + +let wasm!: MtcuteWasmModule +let compressor!: number +let decompressor!: number +let cachedUint8Memory: Uint8Array | null = null + +function initCommon() { + compressor = wasm.libdeflate_alloc_compressor(6) + decompressor = wasm.libdeflate_alloc_decompressor() +} + +function getUint8Memory() { + if (cachedUint8Memory === null || cachedUint8Memory.byteLength === 0) { + cachedUint8Memory = new Uint8Array(wasm.memory.buffer) + } + + return cachedUint8Memory +} + +/** + * Init the WASM blob synchronously (e.g. by passing a `WebAssembly.Module` instance) + */ +export function initSync(module: SyncInitInput): void { + if (wasm !== undefined) return + + if (!(module instanceof WebAssembly.Module)) { + module = new WebAssembly.Module(module) + } + + const instance = new WebAssembly.Instance(module) + + wasm = instance.exports as unknown as MtcuteWasmModule + initCommon() +} + +/** + * Init the WASM blob asynchronously (e.g. by passing a URL to the WASM file) + * + * By default, will try to determine the best way to load the WASM file automatically. + */ +export async function initAsync(input?: InitInput): Promise { + if (wasm !== undefined) return + const instance = await loadWasmBinary(input) + + wasm = instance.exports as unknown as MtcuteWasmModule + initCommon() +} + +/** + * Deflate some data with zlib headers and max output size + * + * @returns null if the compressed data is larger than `size`, otherwise the compressed data + */ +export function deflateMaxSize(bytes: Uint8Array, size: number): Uint8Array | null { + const outputPtr = wasm.__malloc(size) + const inputPtr = wasm.__malloc(bytes.length) + getUint8Memory().set(bytes, inputPtr) + + const written = wasm.libdeflate_zlib_compress(compressor, inputPtr, bytes.length, outputPtr, size) + wasm.__free(inputPtr) + + if (written === 0) { + wasm.__free(outputPtr) + + return null + } + + const result = getUint8Memory().slice(outputPtr, outputPtr + written) + wasm.__free(outputPtr) + + return result +} + +/** + * Try to decompress some data with zlib headers + * + * @throws Error if the data is invalid + * @param defaultCapacity default capacity of the output buffer. 
Defaults to `bytes.length * 2` + */ +export function gunzip(bytes: Uint8Array): Uint8Array { + const inputPtr = wasm.__malloc(bytes.length) + getUint8Memory().set(bytes, inputPtr) + + const size = wasm.libdeflate_gzip_get_output_size(inputPtr, bytes.length) + const outputPtr = wasm.__malloc(size) + + const ret = wasm.libdeflate_gzip_decompress(decompressor, inputPtr, bytes.length, outputPtr, size) + + if (ret === -1) throw new Error('gunzip error -- bad data') + if (ret === -2) throw new Error('gunzip error -- short output') + if (ret === -3) throw new Error('gunzip error -- short input') // should never happen + + const result = getUint8Memory().slice(outputPtr, outputPtr + size) + wasm.__free(inputPtr) + wasm.__free(outputPtr) + + return result +} + +/** + * Perform AES-IGE-256 encryption + * + * @param data data to encrypt (must be a multiple of 16 bytes) + * @param key encryption key (32 bytes) + * @param iv initialization vector (32 bytes) + */ +export function ige256Encrypt(data: Uint8Array, key: Uint8Array, iv: Uint8Array): Uint8Array { + const ptr = wasm.__malloc(key.length + iv.length + data.length + data.length) + + const keyPtr = ptr + const ivPtr = ptr + key.length + const inputPtr = ivPtr + iv.length + const outputPtr = inputPtr + data.length + + const mem = getUint8Memory() + mem.set(data, inputPtr) + mem.set(key, keyPtr) + mem.set(iv, ivPtr) + + wasm.ige256_encrypt(inputPtr, data.length, keyPtr, ivPtr, outputPtr) + const result = getUint8Memory().slice(outputPtr, outputPtr + data.length) + + wasm.__free(ptr) + + return result +} + +/** + * Perform AES-IGE-256 decryption + * + * @param data data to decrypt (must be a multiple of 16 bytes) + * @param key encryption key (32 bytes) + * @param iv initialization vector (32 bytes) + */ +export function ige256Decrypt(data: Uint8Array, key: Uint8Array, iv: Uint8Array): Uint8Array { + const ptr = wasm.__malloc(key.length + iv.length + data.length + data.length) + + const keyPtr = ptr + const ivPtr = ptr + key.length + const inputPtr = ivPtr + iv.length + const outputPtr = inputPtr + data.length + + const mem = getUint8Memory() + mem.set(data, inputPtr) + mem.set(key, keyPtr) + mem.set(iv, ivPtr) + + wasm.ige256_decrypt(inputPtr, data.length, keyPtr, ivPtr, outputPtr) + + const result = getUint8Memory().slice(outputPtr, outputPtr + data.length) + wasm.__free(ptr) + + return result +} + +/** + * Create a context for AES-CTR-256 en/decryption + * + * > **Note**: `freeCtr256` must be called on the returned context when it's no longer needed + */ +export function createCtr256(key: Uint8Array, iv: Uint8Array) { + const keyPtr = wasm.__malloc(key.length) + const ivPtr = wasm.__malloc(iv.length) + getUint8Memory().set(key, keyPtr) + getUint8Memory().set(iv, ivPtr) + + const ctx = wasm.ctr256_alloc(keyPtr, ivPtr) + // pointers are "moved" and will be handled by c code + + return ctx +} + +/** + * Release a context for AES-CTR-256 en/decryption + */ +export function freeCtr256(ctx: number) { + wasm.ctr256_free(ctx) +} + +/** + * Perform AES-CTR-256 en/decryption + * + * @param ctx context returned by `createCtr256` + * @param data data to en/decrypt (must be a multiple of 16 bytes) + */ +export function ctr256(ctx: number, data: Uint8Array): Uint8Array { + const { __malloc, __free } = wasm + const inputPtr = __malloc(data.length) + const outputPtr = __malloc(data.length) + + const mem = getUint8Memory() + mem.set(data, inputPtr) + + wasm.ctr256(ctx, inputPtr, data.length, outputPtr) + + const result = mem.slice(outputPtr, outputPtr +
data.length) + __free(outputPtr) + + return result +} + +/** + * Get the WASM module instance. + * + * For debugging and testing purposes only + */ +export function __getWasm(): MtcuteWasmModule { + return wasm +} diff --git a/packages/wasm/src/init.ts b/packages/wasm/src/init.ts new file mode 100644 index 00000000..794bf8f9 --- /dev/null +++ b/packages/wasm/src/init.ts @@ -0,0 +1,24 @@ +/* eslint-disable no-restricted-imports */ +import { readFile } from 'fs/promises' +import { join } from 'path' + +import { InitInput } from './types.js' + +// @only-if-esm +const __dirname = new URL('.', import.meta.url).pathname +// @/only-if-esm + +export async function loadWasmBinary(input?: InitInput): Promise { + if (typeof input === 'undefined') { + input = join(__dirname, '../lib/mtcute.wasm') + } + + if (typeof input !== 'string') { + throw new Error('Invalid input, for Node.js pass path to wasm blob') + } + + const module = new WebAssembly.Module(await readFile(input)) + const instance = new WebAssembly.Instance(module) + + return instance +} diff --git a/packages/wasm/src/init.web.ts b/packages/wasm/src/init.web.ts new file mode 100644 index 00000000..51f09013 --- /dev/null +++ b/packages/wasm/src/init.web.ts @@ -0,0 +1,42 @@ +import { InitInput } from './types.js' + +export async function loadWasmBinary(input?: InitInput): Promise { + if (typeof input === 'undefined') { + input = new URL('../mtcute.wasm', import.meta.url) + } + + if ( + typeof input === 'string' || + (typeof Request === 'function' && input instanceof Request) || + (typeof URL === 'function' && input instanceof URL) + ) { + input = await fetch(input) + } + + if (typeof Response === 'function' && input instanceof Response) { + if (typeof WebAssembly.instantiateStreaming === 'function') { + try { + const { instance } = await WebAssembly.instantiateStreaming(input) + + return instance + } catch (e) { + if (input.headers.get('Content-Type') !== 'application/wasm') { + console.warn( + '`WebAssembly.instantiateStreaming` failed because your server does not serve wasm with `application/wasm` MIME type. Falling back to `WebAssembly.instantiate` which is slower. 
Original error:\n', + e, + ) + } else { + throw e + } + } + } + + const bytes = await input.arrayBuffer() + + const { instance } = await WebAssembly.instantiate(bytes) + + return instance + } + + return await WebAssembly.instantiate(input) +} diff --git a/packages/wasm/src/types.ts b/packages/wasm/src/types.ts new file mode 100644 index 00000000..9828a771 --- /dev/null +++ b/packages/wasm/src/types.ts @@ -0,0 +1,24 @@ +export interface MtcuteWasmModule { + memory: WebAssembly.Memory + __malloc: (size: number) => number + __free: (ptr: number) => void + libdeflate_alloc_decompressor: () => number + libdeflate_alloc_compressor: (level: number) => number + + /** @returns if !=0 - error */ + libdeflate_gzip_decompress: (ctx: number, src: number, srcLen: number, dst: number, dstLen: number) => number + libdeflate_gzip_get_output_size: (src: number, srcLen: number) => number + + libdeflate_zlib_compress: (ctx: number, src: number, srcLen: number, dst: number, dstLen: number) => number + + ige256_encrypt: (data: number, dataLen: number, key: number, iv: number, out: number) => void + + ige256_decrypt: (data: number, dataLen: number, key: number, iv: number, out: number) => void + + ctr256_alloc: (key: number, iv: number) => number + ctr256_free: (ctx: number) => void + ctr256: (ctx: number, data: number, dataLen: number, out: number) => number +} + +export type SyncInitInput = BufferSource | WebAssembly.Module +export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module diff --git a/packages/wasm/tests/allocator.spec.ts b/packages/wasm/tests/allocator.spec.ts new file mode 100644 index 00000000..790113c7 --- /dev/null +++ b/packages/wasm/tests/allocator.spec.ts @@ -0,0 +1,21 @@ +import { expect } from 'chai' + +import { __getWasm, initAsync } from '../src/index.js' + +before(async () => { + await initAsync() +}) + +describe('allocator', () => { + it('should not leak memory', () => { + const wasm = __getWasm() + const memUsage = wasm.memory.buffer.byteLength + + for (let i = 0; i < 1024; i++) { + const ptr = wasm.__malloc(1024) + wasm.__free(ptr) + } + + expect(wasm.memory.buffer.byteLength).to.equal(memUsage) + }) +}) diff --git a/packages/wasm/tests/ctr.spec.ts b/packages/wasm/tests/ctr.spec.ts new file mode 100644 index 00000000..95dd5915 --- /dev/null +++ b/packages/wasm/tests/ctr.spec.ts @@ -0,0 +1,149 @@ +/* eslint-disable no-restricted-globals */ +import { expect } from 'chai' +import { before, describe } from 'mocha' + +import { __getWasm, createCtr256, ctr256, freeCtr256, initAsync } from '../src/index.js' + +before(async () => { + await initAsync() +}) + +describe('aes-ctr', () => { + const key = Buffer.from('603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4', 'hex') + const iv = Buffer.from('F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF', 'hex') + + describe('NIST', () => { + // https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/AES_CTR.pdf + const data = Buffer.from( + `6BC1BEE2 2E409F96 E93D7E11 7393172A + AE2D8A57 1E03AC9C 9EB76FAC 45AF8E51 + 30C81C46 A35CE411 E5FBC119 1A0A52EF + F69F2445 DF4F9B17 AD2B417B E66C3710`.replace(/\s/g, ''), + 'hex', + ) + const dataEnc = Buffer.from( + `601EC313 775789A5 B7A7F504 BBF3D228 + F443E3CA 4D62B59A CA84E990 CACAF5C5 + 2B0930DA A23DE94C E87017BA 2D84988D + DFC9C58D B67AADA6 13C2DD08 457941A6`.replace(/\s/g, ''), + 'hex', + ) + + it('should correctly encrypt', () => { + const ctr = createCtr256(key, iv) + const res = ctr256(ctr, data) + freeCtr256(ctr) + + 
expect(Buffer.from(res).toString('hex')).to.equal(dataEnc.toString('hex')) + }) + + it('should correctly decrypt', () => { + const ctr = createCtr256(key, iv) + const res = ctr256(ctr, dataEnc) + freeCtr256(ctr) + + expect(Buffer.from(res).toString('hex')).to.equal(data.toString('hex')) + }) + }) + + describe('stream', () => { + const data = Buffer.from('6BC1BEE22E409F96E93D7E117393172A', 'hex') + const dataEnc1 = Buffer.from('601ec313775789a5b7a7f504bbf3d228', 'hex') + const dataEnc2 = Buffer.from('31afd77f7d218690bd0ef82dfcf66cbe', 'hex') + const dataEnc3 = Buffer.from('7000927e2f2192cbe4b6a8b2441ddd48', 'hex') + + it('should correctly encrypt', () => { + const ctr = createCtr256(key, iv) + const res1 = ctr256(ctr, data) + const res2 = ctr256(ctr, data) + const res3 = ctr256(ctr, data) + + freeCtr256(ctr) + + expect(Buffer.from(res1).toString('hex')).to.equal(dataEnc1.toString('hex')) + expect(Buffer.from(res2).toString('hex')).to.equal(dataEnc2.toString('hex')) + expect(Buffer.from(res3).toString('hex')).to.equal(dataEnc3.toString('hex')) + }) + + it('should correctly decrypt', () => { + const ctr = createCtr256(key, iv) + const res1 = ctr256(ctr, dataEnc1) + const res2 = ctr256(ctr, dataEnc2) + const res3 = ctr256(ctr, dataEnc3) + + freeCtr256(ctr) + + expect(Buffer.from(res1).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res2).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res3).toString('hex')).to.equal(data.toString('hex')) + }) + }) + + describe('stream (unaligned)', () => { + const data = Buffer.from('6BC1BEE22E40', 'hex') + const dataEnc1 = Buffer.from('601ec3137757', 'hex') + const dataEnc2 = Buffer.from('7df2e078a555', 'hex') + const dataEnc3 = Buffer.from('a3a17be0742e', 'hex') + const dataEnc4 = Buffer.from('025ced833746', 'hex') + const dataEnc5 = Buffer.from('3ff238dea125', 'hex') + const dataEnc6 = Buffer.from('1055a52302dc', 'hex') + + it('should correctly encrypt', () => { + const ctr = createCtr256(key, iv) + const res1 = ctr256(ctr, data) + const res2 = ctr256(ctr, data) + const res3 = ctr256(ctr, data) + const res4 = ctr256(ctr, data) + const res5 = ctr256(ctr, data) + const res6 = ctr256(ctr, data) + + freeCtr256(ctr) + + expect(Buffer.from(res1).toString('hex')).to.equal(dataEnc1.toString('hex')) + expect(Buffer.from(res2).toString('hex')).to.equal(dataEnc2.toString('hex')) + expect(Buffer.from(res3).toString('hex')).to.equal(dataEnc3.toString('hex')) + expect(Buffer.from(res4).toString('hex')).to.equal(dataEnc4.toString('hex')) + expect(Buffer.from(res5).toString('hex')).to.equal(dataEnc5.toString('hex')) + expect(Buffer.from(res6).toString('hex')).to.equal(dataEnc6.toString('hex')) + }) + + it('should correctly decrypt', () => { + const ctr = createCtr256(key, iv) + const res1 = ctr256(ctr, dataEnc1) + const res2 = ctr256(ctr, dataEnc2) + const res3 = ctr256(ctr, dataEnc3) + const res4 = ctr256(ctr, dataEnc4) + const res5 = ctr256(ctr, dataEnc5) + const res6 = ctr256(ctr, dataEnc6) + + freeCtr256(ctr) + + expect(Buffer.from(res1).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res2).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res3).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res4).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res5).toString('hex')).to.equal(data.toString('hex')) + expect(Buffer.from(res6).toString('hex')).to.equal(data.toString('hex')) + }) + }) + + it('should not leak memory', () => { + const data = 
Buffer.from('6BC1BEE22E409F96E93D7E117393172A', 'hex') + const mem = __getWasm().memory.buffer + const memSize = mem.byteLength + + for (let i = 0; i < 100; i++) { + const ctrEnc = createCtr256(key, iv) + const ctrDec = createCtr256(key, iv) + + for (let i = 0; i < 100; i++) { + ctr256(ctrDec, ctr256(ctrEnc, data)) + } + + freeCtr256(ctrEnc) + freeCtr256(ctrDec) + } + + expect(mem.byteLength).to.equal(memSize) + }) +}) diff --git a/packages/wasm/tests/gunzip.spec.ts b/packages/wasm/tests/gunzip.spec.ts new file mode 100644 index 00000000..8111b829 --- /dev/null +++ b/packages/wasm/tests/gunzip.spec.ts @@ -0,0 +1,46 @@ +/* eslint-disable no-restricted-globals */ +import { expect } from 'chai' +import { before, describe } from 'mocha' +import { gzipSync } from 'zlib' + +import { __getWasm, gunzip, initAsync } from '../src/index.js' + +before(async () => { + await initAsync() +}) + +describe('gunzip', () => { + it('should correctly read zlib headers', () => { + const wasm = __getWasm() + const data = gzipSync(Buffer.from('hello world')) + + const inputPtr = wasm.__malloc(data.length) + new Uint8Array(wasm.memory.buffer).set(data, inputPtr) + + expect(wasm.libdeflate_gzip_get_output_size(inputPtr, data.length)).to.equal(11) + }) + + it('should correctly inflate', () => { + const data = Array.from({ length: 1000 }, () => 'a').join('') + const res = gzipSync(Buffer.from(data)) + + expect(res).not.to.be.null + expect(res.length).to.be.lessThan(100) + expect(gunzip(res)).to.deep.equal(new Uint8Array(Buffer.from(data))) + }) + + it('should not leak memory', () => { + const memSize = __getWasm().memory.buffer.byteLength + + for (let i = 0; i < 100; i++) { + const data = Array.from({ length: 1000 }, () => 'a').join('') + const deflated = gzipSync(Buffer.from(data)) + + const res = gunzip(deflated) + + expect(Buffer.from(res).toString()).to.equal(data) + } + + expect(__getWasm().memory.buffer.byteLength).to.equal(memSize) + }) +}) diff --git a/packages/wasm/tests/ige.spec.ts b/packages/wasm/tests/ige.spec.ts new file mode 100644 index 00000000..6bfcd2bd --- /dev/null +++ b/packages/wasm/tests/ige.spec.ts @@ -0,0 +1,40 @@ +/* eslint-disable no-restricted-globals */ +import { expect } from 'chai' +import { before, describe } from 'mocha' + +import { __getWasm, ige256Decrypt, ige256Encrypt, initAsync } from '../src/index.js' + +before(async () => { + await initAsync() +}) + +describe('aes-ige', () => { + const key = Buffer.from('5468697320697320616E20696D706C655468697320697320616E20696D706C65', 'hex') + const iv = Buffer.from('6D656E746174696F6E206F6620494745206D6F646520666F72204F70656E5353', 'hex') + + const data = Buffer.from('99706487a1cde613bc6de0b6f24b1c7aa448c8b9c3403e3467a8cad89340f53b', 'hex') + const dataEnc = Buffer.from('792ea8ae577b1a66cb3bd92679b8030ca54ee631976bd3a04547fdcb4639fa69', 'hex') + + it('should correctly encrypt', () => { + const aes = ige256Encrypt(data, key, iv) + + expect(Buffer.from(aes).toString('hex')).to.equal(dataEnc.toString('hex')) + }) + + it('should correctly decrypt', () => { + const aes = ige256Decrypt(dataEnc, key, iv) + + expect(Buffer.from(aes).toString('hex')).to.equal(data.toString('hex')) + }) + + it('should not leak memory', () => { + const mem = __getWasm().memory.buffer + const memSize = mem.byteLength + + for (let i = 0; i < 10000; i++) { + ige256Decrypt(ige256Encrypt(data, key, iv), key, iv) + } + + expect(mem.byteLength).to.equal(memSize) + }) +}) diff --git a/packages/wasm/tests/tsconfig.json b/packages/wasm/tests/tsconfig.json new file mode 100644 
index 00000000..23b6b033 --- /dev/null +++ b/packages/wasm/tests/tsconfig.json @@ -0,0 +1,9 @@ +{ + "extends": "../../../tsconfig.json", + "include": [ + "." + ], + "references": [ + { "path": "../" } + ] +} diff --git a/packages/wasm/tests/zlib.spec.ts b/packages/wasm/tests/zlib.spec.ts new file mode 100644 index 00000000..7a35dc90 --- /dev/null +++ b/packages/wasm/tests/zlib.spec.ts @@ -0,0 +1,49 @@ +/* eslint-disable no-restricted-globals */ +import { expect } from 'chai' +import { before, describe } from 'mocha' +import { inflateSync } from 'zlib' + +import { __getWasm, deflateMaxSize, initAsync } from '../src/index.js' + +before(async () => { + await initAsync() +}) + +describe('zlib deflate', () => { + it('should add zlib headers', () => { + const res = deflateMaxSize(Buffer.from('hello world'), 100) + + expect(res).not.to.be.null + expect(res!.slice(0, 2)).to.deep.equal(Buffer.from([0x78, 0x9c])) + }) + + it('should return null if compressed data is larger than size', () => { + const res = deflateMaxSize(Buffer.from('hello world'), 1) + + expect(res).to.be.null + }) + + it('should correctly deflate', () => { + const data = Array.from({ length: 1000 }, () => 'a').join('') + const res = deflateMaxSize(Buffer.from(data), 100) + + expect(res).not.to.be.null + expect(res!.length).to.be.lessThan(100) + expect(inflateSync(res!)).to.deep.equal(Buffer.from(data)) + }) + + it('should not leak memory', () => { + const memSize = __getWasm().memory.buffer.byteLength + + for (let i = 0; i < 100; i++) { + const data = Array.from({ length: 1000 }, () => 'a').join('') + const deflated = deflateMaxSize(Buffer.from(data), 100) + + const res = inflateSync(deflated!) + + expect(Buffer.from(res).toString()).to.equal(data) + } + + expect(__getWasm().memory.buffer.byteLength).to.equal(memSize) + }) +}) diff --git a/packages/wasm/tsconfig.json b/packages/wasm/tsconfig.json new file mode 100644 index 00000000..6a3f7a53 --- /dev/null +++ b/packages/wasm/tsconfig.json @@ -0,0 +1,10 @@ +{ + "extends": "../../tsconfig.json", + "compilerOptions": { + "outDir": "./dist/esm", + "rootDir": "./src" + }, + "include": [ + "./src", + ] +} diff --git a/packages/wasm/typedoc.cjs b/packages/wasm/typedoc.cjs new file mode 100644 index 00000000..84e0d8c2 --- /dev/null +++ b/packages/wasm/typedoc.cjs @@ -0,0 +1,4 @@ +module.exports = { + extends: ['../../typedoc.base.cjs'], + entryPoints: ['./src/index.ts'], +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 56488ef2..2bde8c29 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -126,6 +126,9 @@ importers: '@mtcute/tl-runtime': specifier: workspace:^ version: link:../tl-runtime + '@mtcute/wasm': + specifier: workspace:^ + version: link:../wasm '@types/events': specifier: 3.0.0 version: 3.0.0 @@ -139,9 +142,6 @@ importers: specifier: 5.2.3 version: 5.2.3 devDependencies: - '@cryptography/aes': - specifier: ^0.1.1 - version: 0.1.1 '@types/ws': specifier: 8.5.4 version: 8.5.4 @@ -312,13 +312,6 @@ importers: long: specifier: 5.2.3 version: 5.2.3 - pako: - specifier: 2.1.0 - version: 2.1.0 - devDependencies: - '@types/pako': - specifier: 2.0.0 - version: 2.0.0 packages/tl-utils: dependencies: @@ -330,6 +323,8 @@ importers: specifier: workspace:^ version: link:../tl-runtime + packages/wasm: {} + packages: /@aashutoshrathi/word-wrap@1.2.6: @@ -702,10 +697,6 @@ packages: chalk: 4.1.2 dev: true - /@cryptography/aes@0.1.1: - resolution: {integrity: sha512-PcYz4FDGblO6tM2kSC+VzhhK62vml6k6/YAkiWtyPvrgJVfnDRoHGDtKn5UiaRRUrvUTTocBpvc2rRgTCqxjsg==} - dev: true - 
/@cspotcode/source-map-support@0.8.1: resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==} engines: {node: '>=12'} @@ -968,10 +959,6 @@ packages: resolution: {integrity: sha512-Gj7cI7z+98M282Tqmp2K5EIsoouUEzbBJhQQzDE3jSIRk6r9gsz0oUokqIUR4u1R3dMHo0pDHM7sNOHyhulypw==} dev: true - /@types/pako@2.0.0: - resolution: {integrity: sha512-10+iaz93qR5WYxTo+PMifD5TSxiOtdRaxBf7INGGXMQgTCu8Z/7GYWYFUOS3q/G0nE5boj1r4FEB+WSy7s5gbA==} - dev: true - /@types/semver@7.5.0: resolution: {integrity: sha512-G8hZ6XJiHnuhQKR7ZmysCeJWE08o8T0AXtk5darsCaTVsYZhhgUrq53jizaR2FvsoeCwJhlmwTjkXBY5Pn/ZHw==} dev: true @@ -4311,10 +4298,6 @@ packages: release-zalgo: 1.0.0 dev: true - /pako@2.1.0: - resolution: {integrity: sha512-w+eufiZ1WuJYgPXbV/PO3NCMEc3xqylkKHzp8bxp1uW4qaSNQUkwmLLEc3kKsfz8lpV1F8Ht3U1Cm+9Srog2ug==} - dev: false - /parent-module@1.0.1: resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==} engines: {node: '>=6'} diff --git a/scripts/build-package.js b/scripts/build-package.js index 4aa645a2..2512d761 100644 --- a/scripts/build-package.js +++ b/scripts/build-package.js @@ -50,14 +50,10 @@ const buildConfig = { }) } - console.log(config) - return config })(), } -console.log(buildConfig) - function buildPackageJson() { const pkgJson = JSON.parse(fs.readFileSync(path.join(packageDir, 'package.json'), 'utf-8'))
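The entry points added in packages/wasm/src/index.ts are lazy: nothing works until initSync or initAsync has completed. Below is a minimal sketch of the three supported ways to feed the binary to the module; the package specifier `@mtcute/wasm` comes from the new package.json, while the browser URL and the `wasmBytes` argument are placeholders, not part of the patch.

```ts
import { __getWasm, initAsync, initSync } from '@mtcute/wasm'

// Node: with no argument, init.ts reads ../lib/mtcute.wasm relative to the module
async function bootNode(): Promise<void> {
    await initAsync()
}

// Browser: init.web.ts accepts a string/URL/Request/Response and falls back from
// instantiateStreaming to instantiate when the MIME type is not application/wasm
async function bootBrowser(): Promise<void> {
    await initAsync(new URL('/assets/mtcute.wasm', location.origin)) // placeholder URL
}

// Synchronous path: raw bytes or an already-compiled WebAssembly.Module
function bootSync(wasmBytes: BufferSource): void {
    initSync(wasmBytes)
    console.log('exports ready:', typeof __getWasm().__malloc === 'function')
}
```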
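deflateMaxSize and gunzip wrap libdeflate_zlib_compress and libdeflate_gzip_decompress respectively, so the former emits zlib-framed output while the latter expects gzip-framed input. A usage sketch under those assumptions (the payload, the 512-byte cap, and the use of node:zlib to fabricate gzip input, as the specs above do, are illustrative):

```ts
import { gzipSync } from 'node:zlib'

import { deflateMaxSize, gunzip, initAsync } from '@mtcute/wasm'

async function main(): Promise<void> {
    await initAsync()

    // zlib-compress only if the result fits into 512 bytes; null means "send uncompressed"
    const payload = new TextEncoder().encode('a'.repeat(4096))
    const packed = deflateMaxSize(payload, 512)
    console.log(packed === null ? 'not worth compressing' : `packed ${payload.length} -> ${packed.length} bytes`)

    // gunzip expects gzip-framed data; fabricate some with node:zlib for the demo
    const gzipped = new Uint8Array(gzipSync('hello world'))
    console.log(new TextDecoder().decode(gunzip(gzipped))) // "hello world"
}

main().catch(console.error)
```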
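The crypto exports mirror the C code under lib/crypto: the IGE calls are one-shot, while CTR keeps keystream state in a wasm-side context that must be released with freeCtr256. A hedged sketch of the intended call pattern (keys, IVs, and the 64-byte buffer are random placeholders; the sizes follow the doc comments and specs above: 32-byte key and IV for IGE, 32-byte key and 16-byte IV for CTR):

```ts
import { randomBytes } from 'node:crypto'

import { createCtr256, ctr256, freeCtr256, ige256Decrypt, ige256Encrypt, initAsync } from '@mtcute/wasm'

async function main(): Promise<void> {
    await initAsync()

    // AES-256-IGE: 32-byte key, 32-byte IV, data length a multiple of 16
    const igeKey = new Uint8Array(randomBytes(32))
    const igeIv = new Uint8Array(randomBytes(32))
    const plaintext = new Uint8Array(randomBytes(64))

    const encrypted = ige256Encrypt(plaintext, igeKey, igeIv)
    const decrypted = ige256Decrypt(encrypted, igeKey, igeIv)
    console.log('ige roundtrip ok:', Buffer.compare(Buffer.from(decrypted), Buffer.from(plaintext)) === 0)

    // AES-256-CTR: the context carries keystream position across calls and lives on the wasm heap
    const ctr = createCtr256(new Uint8Array(randomBytes(32)), new Uint8Array(randomBytes(16)))
    try {
        const part1 = ctr256(ctr, plaintext.subarray(0, 32))
        const part2 = ctr256(ctr, plaintext.subarray(32))
        console.log('ctr produced', part1.length + part2.length, 'bytes')
    } finally {
        freeCtr256(ctr)
    }
}

main().catch(console.error)
```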