xz 源码阅读 - 2
上篇文章说到coder->sequence被设置成了“SEQ_BLOCK”。
// Fall through
case SEQ_BLOCK: {
const lzma_ret ret = coder->block_decoder.code(
coder->block_decoder.coder, allocator,
in, in_pos, in_size, out, out_pos, out_size,
action);
if (ret != LZMA_STREAM_END)
return ret;
// Block decoded successfully. Add the new size pair to
// the Index hash.
return_if_error(lzma_index_hash_append(coder->index_hash,
lzma_block_unpadded_size(
&coder->block_options),
coder->block_options.uncompressed_size));
coder->sequence = SEQ_BLOCK_HEADER;
break;
}
随后,这里开始调用block_decoder.code。前面它已经被设置成了block_decode:
next->coder = coder;
next->code = &block_decode;
next->end = &block_decoder_end;
coder->next = LZMA_NEXT_CODER_INIT;
// Basic initializations
coder->sequence = SEQ_CODE;
coder->block = block;
coder->compressed_size = 0;
coder->uncompressed_size = 0;
因此,我们查看block_decode的代码。由于代码比较长,这里继续加序号阅读。
static lzma_ret
block_decode(void *coder_ptr, const lzma_allocator *allocator,
const uint8_t *restrict in, size_t *restrict in_pos,
size_t in_size, uint8_t *restrict out,
size_t *restrict out_pos, size_t out_size, lzma_action action)
{
lzma_block_coder *coder = coder_ptr;
block_decode:1. 注意这里的coder->sequence是另一个“sequence”,它最开始是被初始化成SEQ_CODE的。因此第一步从调用coder->next.code开始。
coder = coder_ptr(参数1)是上一步的coder->block_decoder.coder,因此这里的coder->next.code实际上就是coder->block_decoder.coder->next.code,传入的第一个参数是对应的next.coder。具体是哪个decoder由filter决定。
switch (coder->sequence) {
case SEQ_CODE: {
const size_t in_start = *in_pos;
const size_t out_start = *out_pos;
const lzma_ret ret = coder->next.code(coder->next.coder,
allocator, in, in_pos, in_size,
out, out_pos, out_size, action);
const size_t in_used = *in_pos - in_start;
const size_t out_used = *out_pos - out_start;
// NOTE: We compare to compressed_limit here, which prevents
// the total size of the Block growing past LZMA_VLI_MAX.
if (update_size(&coder->compressed_size, in_used,
coder->compressed_limit)
|| update_size(&coder->uncompressed_size,
out_used,
coder->block->uncompressed_size))
return LZMA_DATA_ERROR;
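block_decode:2. 调用完成后,如果没有设置ignore_check,就调用lzma_check_update,用本次解出的数据(out + out_start,长度out_used)增量更新校验值。注意这里只是更新,真正的比较在后面的SEQ_CHECK状态中进行。当next.code返回LZMA_STREAM_END后,校验压缩/解压大小,并把sequence设置为SEQ_PADDING。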
if (!coder->ignore_check)
lzma_check_update(&coder->check, coder->block->check,
out + out_start, out_used);
if (ret != LZMA_STREAM_END)
return ret;
// Compressed and Uncompressed Sizes are now at their final
// values. Verify that they match the values given to us.
if (!is_size_valid(coder->compressed_size,
coder->block->compressed_size)
|| !is_size_valid(coder->uncompressed_size,
coder->block->uncompressed_size))
return LZMA_DATA_ERROR;
// Copy the values into coder->block. The caller
// may use this information to construct Index.
coder->block->compressed_size = coder->compressed_size;
coder->block->uncompressed_size = coder->uncompressed_size;
coder->sequence = SEQ_PADDING;
}
block_decode:3. 进入SEQ_PADDING状态,逐字节读取Block Padding并检查它们是否为0x00,直到compressed_size对齐到4字节。如果check类型为LZMA_CHECK_NONE则直接结束,否则进入SEQ_CHECK。
// Fall through
case SEQ_PADDING:
// Compressed Data is padded to a multiple of four bytes.
while (coder->compressed_size & 3) {
if (*in_pos >= in_size)
return LZMA_OK;
// We use compressed_size here just get the Padding
// right. The actual Compressed Size was stored to
// coder->block already, and won't be modified by
// us anymore.
++coder->compressed_size;
if (in[(*in_pos)++] != 0x00)
return LZMA_DATA_ERROR;
}
if (coder->block->check == LZMA_CHECK_NONE)
return LZMA_STREAM_END;
if (!coder->ignore_check)
lzma_check_finish(&coder->check, coder->block->check);
coder->sequence = SEQ_CHECK;
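block_decode:4. 进入SEQ_CHECK状态,首先获取check_size,把输入中的Check字段拷贝到coder->block->raw_check,拷贝完整后再与前面计算出的coder->check.buffer进行比较。不一致返回LZMA_DATA_ERROR,否则返回LZMA_STREAM_END,Block解码结束。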
// Fall through
case SEQ_CHECK: {
const size_t check_size = lzma_check_size(coder->block->check);
lzma_bufcpy(in, in_pos, in_size, coder->block->raw_check,
&coder->check_pos, check_size);
if (coder->check_pos < check_size)
return LZMA_OK;
// Validate the Check only if we support it.
// coder->check.buffer may be uninitialized
// when the Check ID is not supported.
if (!coder->ignore_check
&& lzma_check_is_supported(coder->block->check)
&& memcmp(coder->block->raw_check,
coder->check.buffer.u8,
check_size) != 0)
return LZMA_DATA_ERROR;
return LZMA_STREAM_END;
}
}
return LZMA_PROG_ERROR;
}
回到最开始的地方,coder->sequence被设置为SEQ_BLOCK_HEADER。这是解压的上一个状态,我们跳过它,直接看下一个状态SEQ_INDEX。
case SEQ_INDEX: {
// If we don't have any input, don't call
// lzma_index_hash_decode() since it would return
// LZMA_BUF_ERROR, which we must not do here.
if (*in_pos >= in_size)
return LZMA_OK;
// Decode the Index and compare it to the hash calculated
// from the sizes of the Blocks (if any).
const lzma_ret ret = lzma_index_hash_decode(coder->index_hash,
in, in_pos, in_size);
if (ret != LZMA_STREAM_END)
return ret;
coder->sequence = SEQ_STREAM_FOOTER;
}
// Fall through
主代码为lzma_index_hash_decode。整体代码非常简单,这里不多做介绍了。
// Incrementally decode the Index field from in[*in_pos .. in_size) and
// verify it against the Block information accumulated in
// index_hash->blocks. The function is a resumable state machine driven by
// index_hash->sequence, so it may be called repeatedly with more input.
//
// Return values visible in this function:
//   - LZMA_BUF_ERROR:  called with no input available.
//   - LZMA_OK:         all available input was consumed, more is needed.
//   - LZMA_STREAM_END: the Index and its CRC32 were read and matched.
//   - LZMA_DATA_ERROR: the Index contents disagree with the decoded Blocks
//                      or a padding/CRC32 byte is wrong.
//   - LZMA_PROG_ERROR: index_hash->sequence holds an unexpected value.
//   - Any other value returned by lzma_vli_decode() or hash_append() is
//     passed through to the caller.
extern LZMA_API(lzma_ret)
lzma_index_hash_decode(lzma_index_hash *index_hash, const uint8_t *in,
size_t *in_pos, size_t in_size)
{
// Catch zero input buffer here, because in contrast to Index encoder
// and decoder functions, applications call this function directly
// instead of via lzma_code(), which does the buffer checking.
if (*in_pos >= in_size)
return LZMA_BUF_ERROR;
// NOTE: This function has many similarities to index_encode() and
// index_decode() functions found from index_encoder.c and
// index_decoder.c. See the comments especially in index_encoder.c.
// Remember where this call started so that the CRC32 can be updated
// over exactly the bytes consumed during this call.
const size_t in_start = *in_pos;
lzma_ret ret = LZMA_OK;
// The loop body is the switch statement itself: a "break" inside a case
// leaves the switch and re-evaluates the loop condition.
while (*in_pos < in_size)
switch (index_hash->sequence) {
case SEQ_BLOCK:
// Check the Index Indicator is present.
if (in[(*in_pos)++] != 0x00)
return LZMA_DATA_ERROR;
index_hash->sequence = SEQ_COUNT;
break;
case SEQ_COUNT: {
// index_hash->pos carries the partial-decode position across calls.
ret = lzma_vli_decode(&index_hash->remaining,
&index_hash->pos, in, in_pos, in_size);
// Anything other than LZMA_STREAM_END (presumably "need more input"
// or an error) is returned to the caller after updating the CRC32.
if (ret != LZMA_STREAM_END)
goto out;
// The count must match the count of the Blocks decoded.
if (index_hash->remaining != index_hash->blocks.count)
return LZMA_DATA_ERROR;
ret = LZMA_OK;
index_hash->pos = 0;
// Handle the special case when there are no Blocks.
index_hash->sequence = index_hash->remaining == 0
? SEQ_PADDING_INIT : SEQ_UNPADDED;
break;
}
case SEQ_UNPADDED:
case SEQ_UNCOMPRESSED: {
// Both fields are decoded the same way; only the destination differs.
lzma_vli *size = index_hash->sequence == SEQ_UNPADDED
? &index_hash->unpadded_size
: &index_hash->uncompressed_size;
ret = lzma_vli_decode(size, &index_hash->pos,
in, in_pos, in_size);
if (ret != LZMA_STREAM_END)
goto out;
ret = LZMA_OK;
index_hash->pos = 0;
if (index_hash->sequence == SEQ_UNPADDED) {
// Reject Unpadded Size values outside the allowed range before
// reading the matching Uncompressed Size.
if (index_hash->unpadded_size < UNPADDED_SIZE_MIN
|| index_hash->unpadded_size
> UNPADDED_SIZE_MAX)
return LZMA_DATA_ERROR;
index_hash->sequence = SEQ_UNCOMPRESSED;
} else {
// Update the hash.
return_if_error(hash_append(&index_hash->records,
index_hash->unpadded_size,
index_hash->uncompressed_size));
// Verify that we don't go over the known sizes. Note
// that this validation is simpler than the one used
// in lzma_index_hash_append(), because here we know
// that values in index_hash->blocks are already
// validated and we are fine as long as we don't
// exceed them in index_hash->records.
if (index_hash->blocks.blocks_size
< index_hash->records.blocks_size
|| index_hash->blocks.uncompressed_size
< index_hash->records.uncompressed_size
|| index_hash->blocks.index_list_size
< index_hash->records.index_list_size)
return LZMA_DATA_ERROR;
// Check if this was the last Record.
index_hash->sequence = --index_hash->remaining == 0
? SEQ_PADDING_INIT : SEQ_UNPADDED;
}
break;
}
case SEQ_PADDING_INIT:
// Reuse index_hash->pos as a countdown of padding bytes (0-3) still
// to be read; presumably this rounds the unpadded Index size up to a
// multiple of four.
index_hash->pos = (LZMA_VLI_C(4) - index_size_unpadded(
index_hash->records.count,
index_hash->records.index_list_size)) & 3;
index_hash->sequence = SEQ_PADDING;
// Fall through
case SEQ_PADDING:
// Consume one padding byte per loop iteration; each must be 0x00.
if (index_hash->pos > 0) {
--index_hash->pos;
if (in[(*in_pos)++] != 0x00)
return LZMA_DATA_ERROR;
break;
}
// Compare the sizes.
if (index_hash->blocks.blocks_size
!= index_hash->records.blocks_size
|| index_hash->blocks.uncompressed_size
!= index_hash->records.uncompressed_size
|| index_hash->blocks.index_list_size
!= index_hash->records.index_list_size)
return LZMA_DATA_ERROR;
// Finish the hashes and compare them.
lzma_check_finish(&index_hash->blocks.check, LZMA_CHECK_BEST);
lzma_check_finish(&index_hash->records.check, LZMA_CHECK_BEST);
if (memcmp(index_hash->blocks.check.buffer.u8,
index_hash->records.check.buffer.u8,
lzma_check_size(LZMA_CHECK_BEST)) != 0)
return LZMA_DATA_ERROR;
// Finish the CRC32 calculation.
// This is done here rather than at "out" because the SEQ_CRC32 case
// below returns directly, so the stored CRC32 bytes themselves are
// never fed into the calculation.
index_hash->crc32 = lzma_crc32(in + in_start,
*in_pos - in_start, index_hash->crc32);
index_hash->sequence = SEQ_CRC32;
// Fall through
case SEQ_CRC32:
// Compare the stored CRC32 one byte at a time, least significant
// byte first; index_hash->pos (0 on entry) counts the bytes verified
// so far and persists across calls.
do {
if (*in_pos == in_size)
return LZMA_OK;
if (((index_hash->crc32 >> (index_hash->pos * 8))
& 0xFF) != in[(*in_pos)++])
return LZMA_DATA_ERROR;
} while (++index_hash->pos < 4);
return LZMA_STREAM_END;
default:
assert(0);
return LZMA_PROG_ERROR;
}
// Reached either via "goto out" or when the loop runs out of input.
out:
// Update the CRC32,
index_hash->crc32 = lzma_crc32(in + in_start,
*in_pos - in_start, index_hash->crc32);
return ret;
}
回到上一层,状态SEQ_STREAM_FOOTER,代码如下。
case SEQ_STREAM_FOOTER: {
// Copy the Stream Footer to the internal buffer.
lzma_bufcpy(in, in_pos, in_size, coder->buffer, &coder->pos,
LZMA_STREAM_HEADER_SIZE);
// Return if we didn't get the whole Stream Footer yet.
if (coder->pos < LZMA_STREAM_HEADER_SIZE)
return LZMA_OK;
coder->pos = 0;
// Decode the Stream Footer. The decoder gives
// LZMA_FORMAT_ERROR if the magic bytes don't match,
// so convert that return code to LZMA_DATA_ERROR.
lzma_stream_flags footer_flags;
lzma_stream_footer_decode用于解码footer,解码flags并设置backward_size。校验footer size,并返回结果。
const lzma_ret ret = lzma_stream_footer_decode(
&footer_flags, coder->buffer);
if (ret != LZMA_OK)
return ret == LZMA_FORMAT_ERROR
? LZMA_DATA_ERROR : ret;
// Check that Index Size stored in the Stream Footer matches
// the real size of the Index field.
if (lzma_index_hash_size(coder->index_hash)
!= footer_flags.backward_size)
return LZMA_DATA_ERROR;
// Compare that the Stream Flags fields are identical in
// both Stream Header and Stream Footer.
return_if_error(lzma_stream_flags_compare(
&coder->stream_flags, &footer_flags));
if (!coder->concatenated)
return LZMA_STREAM_END;
coder->sequence = SEQ_STREAM_PADDING;
}
// Fall through
然后是SEQ_STREAM_PADDING状态。这个状态也没有什么新奇的东西,就是跳过Stream之间值为0x00的padding字节,并检查padding的长度是否为4字节的整数倍;遇到非零字节时,说明可能是下一个Stream的开始,于是重置decoder继续解码。
case SEQ_STREAM_PADDING:
assert(coder->concatenated);
// Skip over possible Stream Padding.
while (true) {
if (*in_pos >= in_size) {
// Unless LZMA_FINISH was used, we cannot
// know if there's more input coming later.
if (action != LZMA_FINISH)
return LZMA_OK;
// Stream Padding must be a multiple of
// four bytes.
return coder->pos == 0
? LZMA_STREAM_END
: LZMA_DATA_ERROR;
}
// If the byte is not zero, it probably indicates
// beginning of a new Stream (or the file is corrupt).
if (in[*in_pos] != 0x00)
break;
++*in_pos;
coder->pos = (coder->pos + 1) & 3;
}
// Stream Padding must be a multiple of four bytes (empty
// Stream Padding is OK).
if (coder->pos != 0) {
++*in_pos;
return LZMA_DATA_ERROR;
}
// Prepare to decode the next Stream.
return_if_error(stream_decoder_reset(coder, allocator));
break;
default:
assert(0);
return LZMA_PROG_ERROR;
}
// Never reached
}