/*
 * decompress_template.h
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * This is the actual DEFLATE decompression routine, lifted out of
 * deflate_decompress.c so that it can be compiled multiple times with
 * different target instruction sets.
 */
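
/*
 * Illustrative usage sketch: this template is meant to be #included from
 * deflate_decompress.c with FUNCNAME, and optionally ATTRIBUTES, defined
 * beforehand (the #undefs at the end of this file allow repeated
 * inclusion).  The macro values shown here are hypothetical examples, not
 * necessarily the actual names used by deflate_decompress.c:
 *
 *	#define FUNCNAME	deflate_decompress_default
 *	#include "decompress_template.h"
 *
 *	#define FUNCNAME	deflate_decompress_bmi2
 *	#define ATTRIBUTES	__attribute__((target("bmi2")))
 *	#include "decompress_template.h"
 */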

#ifndef ATTRIBUTES
#  define ATTRIBUTES
#endif
#ifndef EXTRACT_VARBITS
#  define EXTRACT_VARBITS(word, count)	((word) & BITMASK(count))
#endif
#ifndef EXTRACT_VARBITS8
#  define EXTRACT_VARBITS8(word, count)	((word) & BITMASK((u8)(count)))
#endif
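
/*
 * For example, EXTRACT_VARBITS(word, 5) evaluates to the low 5 bits of
 * 'word' (word & 0x1F).  The #ifndef guards above exist presumably so that
 * a target-specific instantiation can predefine faster versions of these
 * helpers before including this file; the definitions here are only
 * fallbacks.
 */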

static enum libdeflate_result ATTRIBUTES
FUNCNAME(struct libdeflate_decompressor * restrict d,
	 const void * restrict in, size_t in_nbytes,
	 void * restrict out, size_t out_nbytes_avail,
	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
{
	u8 *out_next = out;
	u8 * const out_end = out_next + out_nbytes_avail;
	u8 * const out_fastloop_end =
		out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN);

	/* Input bitstream state; see deflate_decompress.c for documentation */
	const u8 *in_next = in;
	const u8 * const in_end = in_next + in_nbytes;
	const u8 * const in_fastloop_end =
		in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ);
	bitbuf_t bitbuf = 0;
	bitbuf_t saved_bitbuf;
	u32 bitsleft = 0;
	size_t overread_count = 0;

	bool is_final_block;
	unsigned block_type;
	unsigned num_litlen_syms;
	unsigned num_offset_syms;
	bitbuf_t litlen_tablemask;
	u32 entry;

next_block:
	/* Starting to read the next block */
	;

	STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3));
	REFILL_BITS();

	/* BFINAL: 1 bit */
	is_final_block = bitbuf & BITMASK(1);

	/* BTYPE: 2 bits */
	block_type = (bitbuf >> 1) & BITMASK(2);
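
	/*
	 * Per RFC 1951, every DEFLATE block starts with a 3-bit header:
	 * bit 0 is BFINAL (set on the last block) and bits 1-2 are BTYPE
	 * (00 = stored/uncompressed, 01 = static Huffman, 10 = dynamic
	 * Huffman, 11 = reserved/invalid), which is exactly what the two
	 * extractions above read from the low bits of 'bitbuf'.
	 */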

	if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {

		/* Dynamic Huffman block */

		/* The order in which precode lengths are stored */
		static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
			16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
		};

		unsigned num_explicit_precode_lens;
		unsigned i;

		/* Read the codeword length counts. */

		STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5));
		num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5));

		STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5));
		num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5));

		STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4));
		num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4));
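
		/*
		 * In RFC 1951 terms, the three counts just read are HLIT
		 * (bits 3-7), HDIST (bits 8-12), and HCLEN (bits 13-16),
		 * which immediately follow the 3-bit BFINAL/BTYPE header in
		 * the bitstream.
		 */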

		d->static_codes_loaded = false;

		/*
		 * Read the precode codeword lengths.
		 *
		 * A 64-bit bitbuffer is just one bit too small to hold the
		 * maximum number of precode lens, so to minimize branches we
		 * merge one len with the previous fields.
		 */
		STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
		if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) {
			d->u.precode_lens[deflate_precode_lens_permutation[0]] =
				(bitbuf >> 17) & BITMASK(3);
			bitbuf >>= 20;
			bitsleft -= 20;
			REFILL_BITS();
			i = 1;
			do {
				d->u.precode_lens[deflate_precode_lens_permutation[i]] =
					bitbuf & BITMASK(3);
				bitbuf >>= 3;
				bitsleft -= 3;
			} while (++i < num_explicit_precode_lens);
		} else {
			bitbuf >>= 17;
			bitsleft -= 17;
			i = 0;
			do {
				if ((u8)bitsleft < 3)
					REFILL_BITS();
				d->u.precode_lens[deflate_precode_lens_permutation[i]] =
					bitbuf & BITMASK(3);
				bitbuf >>= 3;
				bitsleft -= 3;
			} while (++i < num_explicit_precode_lens);
		}
		for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
			d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;

		/* Build the decode table for the precode. */
		SAFETY_CHECK(build_precode_decode_table(d));

		/* Decode the litlen and offset codeword lengths. */
		i = 0;
		do {
			unsigned presym;
			u8 rep_val;
			unsigned rep_count;

			if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7)
				REFILL_BITS();

			/*
			 * The code below assumes that the precode decode table
			 * doesn't have any subtables.
			 */
			STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);

			/* Decode the next precode symbol. */
			entry = d->u.l.precode_decode_table[
				bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)];
			bitbuf >>= (u8)entry;
			bitsleft -= entry; /* optimization: subtract full entry */
			presym = entry >> 16;

			if (presym < 16) {
				/* Explicit codeword length */
				d->u.l.lens[i++] = presym;
				continue;
			}

			/* Run-length encoded codeword lengths */

			/*
			 * Note: we don't need to immediately verify that the
			 * repeat count doesn't overflow the number of elements,
			 * since we've sized the lens array to have enough extra
			 * space to allow for the worst-case overrun (138 zeroes
			 * when only 1 length was remaining).
			 *
			 * In the case of the small repeat counts (presyms 16
			 * and 17), it is fastest to always write the maximum
			 * number of entries. That gets rid of branches that
			 * would otherwise be required.
			 *
			 * It is not just because of the numerical order that
			 * our checks go in the order 'presym < 16', 'presym ==
			 * 16', and 'presym == 17'. For typical data this is
			 * ordered from most frequent to least frequent case.
			 */
			STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);

			if (presym == 16) {
				/* Repeat the previous length 3 - 6 times. */
				SAFETY_CHECK(i != 0);
				rep_val = d->u.l.lens[i - 1];
				STATIC_ASSERT(3 + BITMASK(2) == 6);
				rep_count = 3 + (bitbuf & BITMASK(2));
				bitbuf >>= 2;
				bitsleft -= 2;
				d->u.l.lens[i + 0] = rep_val;
				d->u.l.lens[i + 1] = rep_val;
				d->u.l.lens[i + 2] = rep_val;
				d->u.l.lens[i + 3] = rep_val;
				d->u.l.lens[i + 4] = rep_val;
				d->u.l.lens[i + 5] = rep_val;
				i += rep_count;
			} else if (presym == 17) {
				/* Repeat zero 3 - 10 times. */
				STATIC_ASSERT(3 + BITMASK(3) == 10);
				rep_count = 3 + (bitbuf & BITMASK(3));
				bitbuf >>= 3;
				bitsleft -= 3;
				d->u.l.lens[i + 0] = 0;
				d->u.l.lens[i + 1] = 0;
				d->u.l.lens[i + 2] = 0;
				d->u.l.lens[i + 3] = 0;
				d->u.l.lens[i + 4] = 0;
				d->u.l.lens[i + 5] = 0;
				d->u.l.lens[i + 6] = 0;
				d->u.l.lens[i + 7] = 0;
				d->u.l.lens[i + 8] = 0;
				d->u.l.lens[i + 9] = 0;
				i += rep_count;
			} else {
				/* Repeat zero 11 - 138 times. */
				STATIC_ASSERT(11 + BITMASK(7) == 138);
				rep_count = 11 + (bitbuf & BITMASK(7));
				bitbuf >>= 7;
				bitsleft -= 7;
				__builtin_memset(&d->u.l.lens[i], 0,
						 rep_count * sizeof(d->u.l.lens[i]));
				i += rep_count;
			}
		} while (i < num_litlen_syms + num_offset_syms);

		/* Unnecessary, but check this for consistency with zlib. */
		SAFETY_CHECK(i == num_litlen_syms + num_offset_syms);

	} else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
		u16 len, nlen;

		/*
		 * Uncompressed block: copy 'len' bytes literally from the input
		 * buffer to the output buffer.
		 */

		bitsleft -= 3; /* for BTYPE and BFINAL */

		/*
		 * Align the bitstream to the next byte boundary. This means
		 * the next byte boundary as if we were reading a byte at a
		 * time. Therefore, we have to rewind 'in_next' by any bytes
		 * that have been refilled but not actually consumed yet (not
		 * counting overread bytes, which don't increment 'in_next').
		 */
		bitsleft = (u8)bitsleft;
		SAFETY_CHECK(overread_count <= (bitsleft >> 3));
		in_next -= (bitsleft >> 3) - overread_count;
		overread_count = 0;
		bitbuf = 0;
		bitsleft = 0;

		SAFETY_CHECK(in_end - in_next >= 4);
		len = get_unaligned_le16(in_next);
		nlen = get_unaligned_le16(in_next + 2);
		in_next += 4;

		SAFETY_CHECK(len == (u16)~nlen);
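		/*
		 * For example, a stored block holding 5 data bytes begins
		 * (after byte alignment) with LEN = 0x0005 and NLEN = 0xFFFA,
		 * i.e. the bytes 05 00 fa ff in little-endian order, followed
		 * by the 5 literal bytes themselves.
		 */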
		if (unlikely(len > out_end - out_next))
			return LIBDEFLATE_INSUFFICIENT_SPACE;
		SAFETY_CHECK(len <= in_end - in_next);

		__builtin_memcpy(out_next, in_next, len);
		in_next += len;
		out_next += len;

		goto block_done;

	} else {
		unsigned i;

		SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);

		/*
		 * Static Huffman block: build the decode tables for the static
		 * codes. Skip doing so if the tables are already set up from
		 * an earlier static block; this speeds up decompression of
		 * degenerate input of many empty or very short static blocks.
		 *
		 * Afterwards, the remainder is the same as decompressing a
		 * dynamic Huffman block.
		 */

		bitbuf >>= 3; /* for BTYPE and BFINAL */
		bitsleft -= 3;

		if (d->static_codes_loaded)
			goto have_decode_tables;

		d->static_codes_loaded = true;

		STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
		STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);

		for (i = 0; i < 144; i++)
			d->u.l.lens[i] = 8;
		for (; i < 256; i++)
			d->u.l.lens[i] = 9;
		for (; i < 280; i++)
			d->u.l.lens[i] = 7;
		for (; i < 288; i++)
			d->u.l.lens[i] = 8;

		for (; i < 288 + 32; i++)
			d->u.l.lens[i] = 5;
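
		/*
		 * These are the fixed code lengths from RFC 1951 section
		 * 3.2.6: literal/length symbols 0-143 use 8 bits, 144-255
		 * use 9 bits, 256-279 use 7 bits, 280-287 use 8 bits, and
		 * all 32 offset symbols use 5 bits.
		 */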

		num_litlen_syms = 288;
		num_offset_syms = 32;
	}

	/* Decompressing a Huffman block (either dynamic or static) */

	SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
	SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
have_decode_tables:
	litlen_tablemask = BITMASK(d->litlen_tablebits);

	/*
	 * This is the "fastloop" for decoding literals and matches. It does
	 * bounds checks on in_next and out_next in the loop conditions so that
	 * additional bounds checks aren't needed inside the loop body.
	 *
	 * To reduce latency, the bitbuffer is refilled and the next litlen
	 * decode table entry is preloaded before each loop iteration.
	 */
	if (in_next >= in_fastloop_end || out_next >= out_fastloop_end)
		goto generic_loop;
	REFILL_BITS_IN_FASTLOOP();
	entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
	do {
		u32 length, offset, lit;
		const u8 *src;
		u8 *dst;

		/*
		 * Consume the bits for the litlen decode table entry. Save the
		 * original bitbuf for later, in case the extra match length
		 * bits need to be extracted from it.
		 */
		saved_bitbuf = bitbuf;
		bitbuf >>= (u8)entry;
		bitsleft -= entry; /* optimization: subtract full entry */

		/*
		 * Begin by checking for a "fast" literal, i.e. a literal that
		 * doesn't need a subtable.
		 */
		if (entry & HUFFDEC_LITERAL) {
			/*
			 * On 64-bit platforms, we decode up to 2 extra fast
			 * literals in addition to the primary item, as this
			 * increases performance and still leaves enough bits
			 * remaining for what follows. We could actually do 3,
			 * assuming LITLEN_TABLEBITS=11, but that actually
			 * decreases performance slightly (perhaps by messing
			 * with the branch prediction of the conditional refill
			 * that happens later while decoding the match offset).
			 *
			 * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN
			 * and FASTLOOP_MAX_BYTES_READ need to be updated if the
			 * number of extra literals decoded here is changed.
			 */
			if (/* enough bits for 2 fast literals + length + offset preload? */
			    CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
							 LENGTH_MAXBITS,
							 OFFSET_TABLEBITS) &&
			    /* enough bits for 2 fast literals + slow literal + litlen preload? */
			    CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
							 DEFLATE_MAX_LITLEN_CODEWORD_LEN,
							 LITLEN_TABLEBITS)) {
				/* 1st extra fast literal */
				lit = entry >> 16;
				entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
				saved_bitbuf = bitbuf;
				bitbuf >>= (u8)entry;
				bitsleft -= entry;
				*out_next++ = lit;
				if (entry & HUFFDEC_LITERAL) {
					/* 2nd extra fast literal */
					lit = entry >> 16;
					entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
					saved_bitbuf = bitbuf;
					bitbuf >>= (u8)entry;
					bitsleft -= entry;
					*out_next++ = lit;
					if (entry & HUFFDEC_LITERAL) {
						/*
						 * Another fast literal, but
						 * this one is in lieu of the
						 * primary item, so it doesn't
						 * count as one of the extras.
						 */
						lit = entry >> 16;
						entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
						REFILL_BITS_IN_FASTLOOP();
						*out_next++ = lit;
						continue;
					}
				}
			} else {
				/*
				 * Decode a literal. While doing so, preload
				 * the next litlen decode table entry and refill
				 * the bitbuffer. To reduce latency, we've
				 * arranged for there to be enough "preloadable"
				 * bits remaining to do the table preload
				 * independently of the refill.
				 */
				STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(
						LITLEN_TABLEBITS, LITLEN_TABLEBITS));
				lit = entry >> 16;
				entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
				REFILL_BITS_IN_FASTLOOP();
				*out_next++ = lit;
				continue;
			}
		}

		/*
		 * It's not a literal entry, so it can be a length entry, a
		 * subtable pointer entry, or an end-of-block entry. Detect the
		 * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag.
		 */
		if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
			/* Subtable pointer or end-of-block entry */

			if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
				goto block_done;

			/*
			 * A subtable is required. Load and consume the
			 * subtable entry. The subtable entry can be of any
			 * type: literal, length, or end-of-block.
			 */
			entry = d->u.litlen_decode_table[(entry >> 16) +
				EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
			saved_bitbuf = bitbuf;
			bitbuf >>= (u8)entry;
			bitsleft -= entry;

			/*
			 * 32-bit platforms that use the byte-at-a-time refill
			 * method have to do a refill here for there to always
			 * be enough bits to decode a literal that requires a
			 * subtable, then preload the next litlen decode table
			 * entry; or to decode a match length that requires a
			 * subtable, then preload the offset decode table entry.
			 */
			if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN,
							  LITLEN_TABLEBITS) ||
			    !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS,
							  OFFSET_TABLEBITS))
				REFILL_BITS_IN_FASTLOOP();
			if (entry & HUFFDEC_LITERAL) {
				/* Decode a literal that required a subtable. */
				lit = entry >> 16;
				entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
				REFILL_BITS_IN_FASTLOOP();
				*out_next++ = lit;
				continue;
			}
			if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
				goto block_done;
			/* Else, it's a length that required a subtable. */
		}

		/*
		 * Decode the match length: the length base value associated
		 * with the litlen symbol (which we extract from the decode
		 * table entry), plus the extra length bits. We don't need to
		 * consume the extra length bits here, as they were included in
		 * the bits consumed by the entry earlier. We also don't need
		 * to check for too-long matches here, as this is inside the
		 * fastloop where it's already been verified that the output
		 * buffer has enough space remaining to copy a max-length match.
		 */
		length = entry >> 16;
		length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
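
		/*
		 * As the two lines above imply, for a match-length entry like
		 * this one, (u8)entry is the total number of bits to consume
		 * (codeword plus extra bits), (u8)(entry >> 8) is the number
		 * of codeword bits to shift past before reaching the extra
		 * bits, and entry >> 16 is the base length value.  See
		 * deflate_decompress.c for the authoritative description of
		 * the decode table entry format and its flag bits.
		 */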

		/*
		 * Decode the match offset. There are enough "preloadable" bits
		 * remaining to preload the offset decode table entry, but a
		 * refill might be needed before consuming it.
		 */
		STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS,
							   OFFSET_TABLEBITS));
		entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
		if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS,
						 LITLEN_TABLEBITS)) {
			/*
			 * Decoding a match offset on a 64-bit platform. We may
			 * need to refill once, but then we can decode the whole
			 * offset and preload the next litlen table entry.
			 */
			if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
				/* Offset codeword requires a subtable */
				if (unlikely((u8)bitsleft < OFFSET_MAXBITS +
					     LITLEN_TABLEBITS - PRELOAD_SLACK))
					REFILL_BITS_IN_FASTLOOP();
				bitbuf >>= OFFSET_TABLEBITS;
				bitsleft -= OFFSET_TABLEBITS;
				entry = d->offset_decode_table[(entry >> 16) +
					EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
			} else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS +
					    LITLEN_TABLEBITS - PRELOAD_SLACK))
				REFILL_BITS_IN_FASTLOOP();
		} else {
			/* Decoding a match offset on a 32-bit platform */
			REFILL_BITS_IN_FASTLOOP();
			if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
				/* Offset codeword requires a subtable */
				bitbuf >>= OFFSET_TABLEBITS;
				bitsleft -= OFFSET_TABLEBITS;
				entry = d->offset_decode_table[(entry >> 16) +
					EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
				REFILL_BITS_IN_FASTLOOP();
				/* No further refill needed before extra bits */
				STATIC_ASSERT(CAN_CONSUME(
					OFFSET_MAXBITS - OFFSET_TABLEBITS));
			} else {
				/* No refill needed before extra bits */
				STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS));
			}
		}
		saved_bitbuf = bitbuf;
		bitbuf >>= (u8)entry;
		bitsleft -= entry; /* optimization: subtract full entry */
		offset = entry >> 16;
		offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);

		/* Validate the match offset; needed even in the fastloop. */
		SAFETY_CHECK(offset <= out_next - (const u8 *)out);
		src = out_next - offset;
		dst = out_next;
		out_next += length;

		/*
		 * Before starting to issue the instructions to copy the match,
		 * refill the bitbuffer and preload the litlen decode table
		 * entry for the next loop iteration. This can increase
		 * performance by allowing the latency of the match copy to
		 * overlap with these other operations. To further reduce
		 * latency, we've arranged for there to be enough bits remaining
		 * to do the table preload independently of the refill, except
		 * on 32-bit platforms using the byte-at-a-time refill method.
		 */
		if (!CAN_CONSUME_AND_THEN_PRELOAD(
			MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS,
			    OFFSET_MAXFASTBITS),
			LITLEN_TABLEBITS) &&
		    unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK))
			REFILL_BITS_IN_FASTLOOP();
		entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
		REFILL_BITS_IN_FASTLOOP();

		/*
		 * Copy the match. On most CPUs the fastest method is a
		 * word-at-a-time copy, unconditionally copying about 5 words
		 * since this is enough for most matches without being too much.
		 *
		 * The normal word-at-a-time copy works for offset >= WORDBYTES,
		 * which is most cases. The case of offset == 1 is also common
		 * and is worth optimizing for, since it is just RLE encoding of
		 * the previous byte, which is the result of compressing long
		 * runs of the same byte.
		 *
		 * Writing past the match 'length' is allowed here, since it's
		 * been ensured there is enough output space left for a slight
		 * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if
		 * the maximum possible overrun here is changed.
		 */
		if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) {
			store_word_unaligned(load_word_unaligned(src), dst);
			src += WORDBYTES;
			dst += WORDBYTES;
			store_word_unaligned(load_word_unaligned(src), dst);
			src += WORDBYTES;
			dst += WORDBYTES;
			store_word_unaligned(load_word_unaligned(src), dst);
			src += WORDBYTES;
			dst += WORDBYTES;
			store_word_unaligned(load_word_unaligned(src), dst);
			src += WORDBYTES;
			dst += WORDBYTES;
			store_word_unaligned(load_word_unaligned(src), dst);
			src += WORDBYTES;
			dst += WORDBYTES;
			while (dst < out_next) {
				store_word_unaligned(load_word_unaligned(src), dst);
				src += WORDBYTES;
				dst += WORDBYTES;
				store_word_unaligned(load_word_unaligned(src), dst);
				src += WORDBYTES;
				dst += WORDBYTES;
				store_word_unaligned(load_word_unaligned(src), dst);
				src += WORDBYTES;
				dst += WORDBYTES;
				store_word_unaligned(load_word_unaligned(src), dst);
				src += WORDBYTES;
				dst += WORDBYTES;
				store_word_unaligned(load_word_unaligned(src), dst);
				src += WORDBYTES;
				dst += WORDBYTES;
			}
		} else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) {
			machine_word_t v;

			/*
			 * This part tends to get auto-vectorized, so keep it
			 * copying a multiple of 16 bytes at a time.
			 */
			v = (machine_word_t)0x0101010101010101 * src[0];
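			/*
			 * Multiplying by 0x0101...01 broadcasts src[0] into
			 * every byte of the word, e.g. a byte of 0xAB becomes
			 * 0xABAB...AB, so each word store below writes
			 * WORDBYTES copies of the repeated byte.
			 */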
			store_word_unaligned(v, dst);
			dst += WORDBYTES;
			store_word_unaligned(v, dst);
			dst += WORDBYTES;
			store_word_unaligned(v, dst);
			dst += WORDBYTES;
			store_word_unaligned(v, dst);
			dst += WORDBYTES;
			while (dst < out_next) {
				store_word_unaligned(v, dst);
				dst += WORDBYTES;
				store_word_unaligned(v, dst);
				dst += WORDBYTES;
				store_word_unaligned(v, dst);
				dst += WORDBYTES;
				store_word_unaligned(v, dst);
				dst += WORDBYTES;
			}
		} else if (UNALIGNED_ACCESS_IS_FAST) {
			store_word_unaligned(load_word_unaligned(src), dst);
			src += offset;
			dst += offset;
			store_word_unaligned(load_word_unaligned(src), dst);
			src += offset;
			dst += offset;
			do {
				store_word_unaligned(load_word_unaligned(src), dst);
				src += offset;
				dst += offset;
				store_word_unaligned(load_word_unaligned(src), dst);
				src += offset;
				dst += offset;
			} while (dst < out_next);
		} else {
			*dst++ = *src++;
			*dst++ = *src++;
			do {
				*dst++ = *src++;
			} while (dst < out_next);
		}
	} while (in_next < in_fastloop_end && out_next < out_fastloop_end);

	/*
	 * This is the generic loop for decoding literals and matches. This
	 * handles cases where in_next and out_next are close to the end of
	 * their respective buffers. Usually this loop isn't performance-
	 * critical, as most time is spent in the fastloop above instead. We
	 * therefore omit some optimizations here in favor of smaller code.
	 */
generic_loop:
	for (;;) {
		u32 length, offset;
		const u8 *src;
		u8 *dst;

		REFILL_BITS();
		entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
		saved_bitbuf = bitbuf;
		bitbuf >>= (u8)entry;
		bitsleft -= entry;
		if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) {
			entry = d->u.litlen_decode_table[(entry >> 16) +
				EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
			saved_bitbuf = bitbuf;
			bitbuf >>= (u8)entry;
			bitsleft -= entry;
		}
		length = entry >> 16;
		if (entry & HUFFDEC_LITERAL) {
			if (unlikely(out_next == out_end))
				return LIBDEFLATE_INSUFFICIENT_SPACE;
			*out_next++ = length;
			continue;
		}
		if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
			goto block_done;
		length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
		if (unlikely(length > out_end - out_next))
			return LIBDEFLATE_INSUFFICIENT_SPACE;

		if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS))
			REFILL_BITS();
		entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
		if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
			bitbuf >>= OFFSET_TABLEBITS;
			bitsleft -= OFFSET_TABLEBITS;
			entry = d->offset_decode_table[(entry >> 16) +
				EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
			if (!CAN_CONSUME(OFFSET_MAXBITS))
				REFILL_BITS();
		}
		offset = entry >> 16;
		offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8);
		bitbuf >>= (u8)entry;
		bitsleft -= entry;

		SAFETY_CHECK(offset <= out_next - (const u8 *)out);
		src = out_next - offset;
		dst = out_next;
		out_next += length;

		STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
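		/*
		 * Copy the first two bytes unconditionally (every match is at
		 * least DEFLATE_MIN_MATCH_LEN == 3 bytes long), then loop for
		 * the rest. Copying byte by byte also handles overlapping
		 * copies correctly: with offset == 1, for instance, each byte
		 * written becomes the source of the next, replicating the
		 * previous byte 'length' times.
		 */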
		*dst++ = *src++;
		*dst++ = *src++;
		do {
			*dst++ = *src++;
		} while (dst < out_next);
	}

block_done:
	/* Finished decoding a block */

	if (!is_final_block)
		goto next_block;

	/* That was the last block. */

	bitsleft = (u8)bitsleft;

	/*
	 * If any of the implicit appended zero bytes were consumed (not just
	 * refilled) before hitting end of stream, then the data is bad.
	 */
	SAFETY_CHECK(overread_count <= (bitsleft >> 3));

	/* Optionally return the actual number of bytes consumed. */
	if (actual_in_nbytes_ret) {
		/* Don't count bytes that were refilled but not consumed. */
		in_next -= (bitsleft >> 3) - overread_count;

		*actual_in_nbytes_ret = in_next - (u8 *)in;
	}

	/* Optionally return the actual number of bytes written. */
	if (actual_out_nbytes_ret) {
		*actual_out_nbytes_ret = out_next - (u8 *)out;
	} else {
		if (out_next != out_end)
			return LIBDEFLATE_SHORT_OUTPUT;
	}
	return LIBDEFLATE_SUCCESS;
}

#undef FUNCNAME
#undef ATTRIBUTES
#undef EXTRACT_VARBITS
#undef EXTRACT_VARBITS8