libwebp: Update to 1.5.0

Rémi Verschelde 2025-01-09 15:17:51 +01:00
parent 1f787b63a5
commit 01f88ff138
GPG Key ID: C3336907360768E1
89 changed files with 3131 additions and 2143 deletions

View File

@ -582,7 +582,7 @@ Files extracted from upstream source:
## libwebp
- Upstream: https://chromium.googlesource.com/webm/libwebp/
- Version: 1.4.0 (845d5476a866141ba35ac133f856fa62f0b7445f, 2024)
- Version: 1.5.0 (a4d7a715337ded4451fec90ff8ce79728e04126c, 2024)
- License: BSD-3-Clause
Files extracted from upstream source:

View File

@ -11,11 +11,13 @@ Contributors:
- Christopher Degawa (ccom at randomderp dot com)
- Clement Courbet (courbet at google dot com)
- Djordje Pesut (djordje dot pesut at imgtec dot com)
- Frank (1433351828 at qq dot com)
- Frank Barchard (fbarchard at google dot com)
- Hui Su (huisu at google dot com)
- H. Vetinari (h dot vetinari at gmx dot com)
- Ilya Kurdyukov (jpegqs at gmail dot com)
- Ingvar Stepanyan (rreverser at google dot com)
- Istvan Stefan (Istvan dot Stefan at arm dot com)
- James Zern (jzern at google dot com)
- Jan Engelhardt (jengelh at medozas dot de)
- Jehan (jehan at girinstud dot io)
@ -62,6 +64,7 @@ Contributors:
- Vincent Rabaud (vrabaud at google dot com)
- Vlad Tsyrklevich (vtsyrklevich at chromium dot org)
- Wan-Teh Chang (wtc at google dot com)
- wrv (wrv at utexas dot edu)
- Yang Zhang (yang dot zhang at arm dot com)
- Yannis Guyon (yguyon at google dot com)
- Zhi An Ng (zhin at chromium dot org)

View File

@ -565,10 +565,11 @@ int SharpYuvConvertWithOptions(const void* r_ptr, const void* g_ptr,
scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);
return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride,
rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride,
v_ptr, v_stride, yuv_bit_depth, width, height,
&scaled_matrix, transfer_type);
return DoSharpArgbToYuv(
(const uint8_t*)r_ptr, (const uint8_t*)g_ptr, (const uint8_t*)b_ptr,
rgb_step, rgb_stride, rgb_bit_depth, (uint8_t*)y_ptr, y_stride,
(uint8_t*)u_ptr, u_stride, (uint8_t*)v_ptr, v_stride, yuv_bit_depth,
width, height, &scaled_matrix, transfer_type);
}
//------------------------------------------------------------------------------

View File

@ -52,7 +52,7 @@ extern "C" {
// SharpYUV API version following the convention from semver.org
#define SHARPYUV_VERSION_MAJOR 0
#define SHARPYUV_VERSION_MINOR 4
#define SHARPYUV_VERSION_PATCH 0
#define SHARPYUV_VERSION_PATCH 1
// Version as a uint32_t. The major number is the high 8 bits.
// The minor number is the middle 8 bits. The patch number is the low 16 bits.
#define SHARPYUV_MAKE_VERSION(MAJOR, MINOR, PATCH) \
@ -66,10 +66,17 @@ extern "C" {
SHARPYUV_EXTERN int SharpYuvGetVersion(void);
// RGB to YUV conversion matrix, in 16 bit fixed point.
// y = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
// u = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
// v = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
// Then y, u and v values are divided by 1<<16 and rounded.
// y_ = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
// u_ = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
// v_ = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
// Then the values are divided by 1<<16 and rounded.
// y = (y_ + (1 << 15)) >> 16
// u = (u_ + (1 << 15)) >> 16
// v = (v_ + (1 << 15)) >> 16
//
// Typically, the offset values rgb_to_y[3], rgb_to_u[3] and rgb_to_v[3] depend
// on the input's bit depth, e.g., rgb_to_u[3] = 1 << (rgb_bit_depth - 1 + 16).
// See also sharpyuv_csp.h to get a predefined matrix or generate a matrix.
typedef struct {
int rgb_to_y[4];
int rgb_to_u[4];
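
Note: a minimal sketch of the per-pixel arithmetic documented in the new comment above (illustration only, not the library's internal code path) — apply one matrix row, then round by adding half of `1 << 16` before the shift:

```c
#include <stdint.h>

/* Sketch of the documented rounding: one matrix row applied to one pixel. */
static int ApplyMatrixRow(const int row[4], int r, int g, int b) {
  const int64_t acc = (int64_t)row[0] * r + (int64_t)row[1] * g +
                      (int64_t)row[2] * b + row[3];
  return (int)((acc + (1 << 15)) >> 16);  /* divide by 1 << 16, rounded */
}
/* y = ApplyMatrixRow(m->rgb_to_y, r, g, b); likewise for u and v. */
```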
@ -127,6 +134,8 @@ typedef enum SharpYuvTransferFunctionType {
// adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they
// should be multiples of 2.
// width, height: width and height of the image in pixels
// yuv_matrix: RGB to YUV conversion matrix. The matrix values typically
// depend on the input's rgb_bit_depth.
// This function calls SharpYuvConvertWithOptions with a default transfer
// function of kSharpYuvTransferFunctionSrgb.
SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr,

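Note: a hedged usage sketch for the declaration above (argument order taken from the header; the helper name and buffer layout are this note's own assumptions) — converting an interleaved 8-bit RGB buffer to planar 4:2:0 with the predefined WebP matrix from sharpyuv_csp.h:

```c
#include <stdint.h>
#include "sharpyuv/sharpyuv.h"
#include "sharpyuv/sharpyuv_csp.h"

/* Hypothetical helper: interleaved RGB -> planar YUV 4:2:0, 8-bit. */
int ConvertRgbToYuv420(const uint8_t* rgb, int width, int height,
                       uint8_t* y, uint8_t* u, uint8_t* v) {
  const SharpYuvConversionMatrix* const m =
      SharpYuvGetConversionMatrix(kSharpYuvMatrixWebp);
  return SharpYuvConvert(rgb + 0, rgb + 1, rgb + 2, /*rgb_step=*/3,
                         /*rgb_stride=*/width * 3, /*rgb_bit_depth=*/8,
                         y, /*y_stride=*/width,
                         u, /*u_stride=*/(width + 1) / 2,
                         v, /*v_stride=*/(width + 1) / 2,
                         /*yuv_bit_depth=*/8, width, height, m);
}
```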
View File

@ -22,16 +22,16 @@ void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
const float kr = yuv_color_space->kr;
const float kb = yuv_color_space->kb;
const float kg = 1.0f - kr - kb;
const float cr = 0.5f / (1.0f - kb);
const float cb = 0.5f / (1.0f - kr);
const float cb = 0.5f / (1.0f - kb);
const float cr = 0.5f / (1.0f - kr);
const int shift = yuv_color_space->bit_depth - 8;
const float denom = (float)((1 << yuv_color_space->bit_depth) - 1);
float scale_y = 1.0f;
float add_y = 0.0f;
float scale_u = cr;
float scale_v = cb;
float scale_u = cb;
float scale_v = cr;
float add_uv = (float)(128 << shift);
assert(yuv_color_space->bit_depth >= 8);
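
Note: the fix above is worth spelling out. With Y = kr·R + kg·G + kb·B, the chroma channels are defined as U = (B − Y) / (2(1 − kb)) and V = (R − Y) / (2(1 − kr)), so the U scale is cb = 0.5/(1 − kb) and the V scale is cr = 0.5/(1 − kr); the old code had the two constants crossed:

```c
/* Corrected constants for Rec.601 (kr = 0.299, kb = 0.114): */
const float cb = 0.5f / (1.0f - 0.114f);  /* ~0.5643, scales (B - Y) into U */
const float cr = 0.5f / (1.0f - 0.299f);  /* ~0.7133, scales (R - Y) into V */
```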
@ -59,31 +59,35 @@ void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
}
// Matrices are in YUV_FIX fixed point precision.
// WebP's matrix, similar but not identical to kRec601LimitedMatrix.
// WebP's matrix, similar but not identical to kRec601LimitedMatrix
// Derived using the following formulas:
// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
static const SharpYuvConversionMatrix kWebpMatrix = {
{16839, 33059, 6420, 16 << 16},
{-9719, -19081, 28800, 128 << 16},
{28800, -24116, -4684, 128 << 16},
};
// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeLimited
// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeLimited
static const SharpYuvConversionMatrix kRec601LimitedMatrix = {
{16829, 33039, 6416, 16 << 16},
{-9714, -19071, 28784, 128 << 16},
{28784, -24103, -4681, 128 << 16},
};
// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeFull
// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeFull
static const SharpYuvConversionMatrix kRec601FullMatrix = {
{19595, 38470, 7471, 0},
{-11058, -21710, 32768, 128 << 16},
{32768, -27439, -5329, 128 << 16},
};
// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeLimited
// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeLimited
static const SharpYuvConversionMatrix kRec709LimitedMatrix = {
{11966, 40254, 4064, 16 << 16},
{-6596, -22189, 28784, 128 << 16},
{28784, -26145, -2639, 128 << 16},
};
// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeFull
// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeFull
static const SharpYuvConversionMatrix kRec709FullMatrix = {
{13933, 46871, 4732, 0},
{-7509, -25259, 32768, 128 << 16},
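
Note: the fixed-point entries in these tables follow directly from the coefficients in the comments — each is the RGB weight, times the limited-range scale where applicable (219/255 for Y, 224/255 for U/V), in 16-bit fixed point. A small self-check, assuming that derivation:

```c
#include <math.h>
#include <stdio.h>

int main(void) {
  const double kr = 0.2990, kb = 0.1140, kg = 1.0 - kr - kb;
  const double y_scale = 219.0 / 255.0;   /* limited-range luma swing */
  const double uv_scale = 224.0 / 255.0;  /* limited-range chroma swing */
  /* kRec601LimitedMatrix.rgb_to_y[0..2]: */
  printf("%ld %ld %ld\n", lround(kr * y_scale * 65536),
         lround(kg * y_scale * 65536), lround(kb * y_scale * 65536));
  /* prints 16829 33039 6416, matching the table above */
  printf("%ld\n", lround(0.5 * uv_scale * 65536));  /* 28784 = rgb_to_u[2] */
  return 0;
}
```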

View File

@ -41,10 +41,15 @@ SHARPYUV_EXTERN void SharpYuvComputeConversionMatrix(
// Enums for precomputed conversion matrices.
typedef enum {
// WebP's matrix, similar but not identical to kSharpYuvMatrixRec601Limited
kSharpYuvMatrixWebp = 0,
// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeLimited
kSharpYuvMatrixRec601Limited,
// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeFull
kSharpYuvMatrixRec601Full,
// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeLimited
kSharpYuvMatrixRec709Limited,
// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeFull
kSharpYuvMatrixRec709Full,
kSharpYuvMatrixNum
} SharpYuvMatrixType;

View File

@ -16,7 +16,8 @@
#include "src/utils/bit_reader_inl_utils.h"
#if !defined(USE_GENERIC_TREE)
#if !defined(__arm__) && !defined(_M_ARM) && !WEBP_AARCH64
#if !defined(__arm__) && !defined(_M_ARM) && !WEBP_AARCH64 && \
!defined(__wasm__)
// using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
#define USE_GENERIC_TREE 1 // ALTERNATE_CODE
#else

View File

@ -32,7 +32,7 @@ extern "C" {
// version numbers
#define DEC_MAJ_VERSION 1
#define DEC_MIN_VERSION 4
#define DEC_MIN_VERSION 5
#define DEC_REV_VERSION 0
// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).

View File

@ -20,10 +20,9 @@
#include "src/dsp/dsp.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include "src/dsp/yuv.h"
#include "src/utils/endian_inl_utils.h"
#include "src/utils/huffman_utils.h"
#include "src/utils/utils.h"
#include "src/webp/format_constants.h"
#define NUM_ARGB_CACHE_ROWS 16
@ -381,7 +380,8 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
if (allow_recursion && VP8LReadBits(br, 1)) {
// use meta Huffman codes.
const int huffman_precision = VP8LReadBits(br, 3) + 2;
const int huffman_precision =
MIN_HUFFMAN_BITS + VP8LReadBits(br, NUM_HUFFMAN_BITS);
const int huffman_xsize = VP8LSubSampleSize(xsize, huffman_precision);
const int huffman_ysize = VP8LSubSampleSize(ysize, huffman_precision);
const int huffman_pixs = huffman_xsize * huffman_ysize;
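
Note: replacing `VP8LReadBits(br, 3) + 2` with named constants is behavior-preserving, assuming the new macros in src/webp/format_constants.h carry the old magic values (the same pattern shows up again in the ReadTransform hunk below):

```c
/* Assumed values behind the new names; the decoded precision is still 2..9: */
#define MIN_HUFFMAN_BITS 2    /* was the literal "+ 2" */
#define NUM_HUFFMAN_BITS 3    /* was the literal read width of 3 bits */
/* MIN_HUFFMAN_BITS + VP8LReadBits(br, NUM_HUFFMAN_BITS) == 2 + [0..7] */
```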
@ -1351,7 +1351,8 @@ static int ReadTransform(int* const xsize, int const* ysize,
switch (type) {
case PREDICTOR_TRANSFORM:
case CROSS_COLOR_TRANSFORM:
transform->bits_ = VP8LReadBits(br, 3) + 2;
transform->bits_ =
MIN_TRANSFORM_BITS + VP8LReadBits(br, NUM_TRANSFORM_BITS);
ok = DecodeImageStream(VP8LSubSampleSize(transform->xsize_,
transform->bits_),
VP8LSubSampleSize(transform->ysize_,
@ -1416,7 +1417,9 @@ VP8LDecoder* VP8LNew(void) {
return dec;
}
void VP8LClear(VP8LDecoder* const dec) {
// Resets the decoder in its initial state, reclaiming memory.
// Preserves the dec->status_ value.
static void VP8LClear(VP8LDecoder* const dec) {
int i;
if (dec == NULL) return;
ClearMetadata(&dec->hdr_);

View File

@ -121,10 +121,6 @@ WEBP_NODISCARD int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);
// this function. Returns false in case of error, with updated dec->status_.
WEBP_NODISCARD int VP8LDecodeImage(VP8LDecoder* const dec);
// Resets the decoder in its initial state, reclaiming memory.
// Preserves the dec->status_ value.
void VP8LClear(VP8LDecoder* const dec);
// Clears and deallocates a lossless decoder instance.
void VP8LDelete(VP8LDecoder* const dec);

View File

@ -24,7 +24,7 @@
#include "src/webp/format_constants.h"
#define DMUX_MAJ_VERSION 1
#define DMUX_MIN_VERSION 4
#define DMUX_MIN_VERSION 5
#define DMUX_REV_VERSION 0
typedef struct {

View File

@ -354,8 +354,8 @@ static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
return cost;
}
static void SetResidualCoeffs_C(const int16_t* const coeffs,
VP8Residual* const res) {
static void SetResidualCoeffs_C(const int16_t* WEBP_RESTRICT const coeffs,
VP8Residual* WEBP_RESTRICT const res) {
int n;
res->last = -1;
assert(res->first == 0 || coeffs[0] == 0);
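
Note: from here on, most hunks are this same mechanical change — pointer parameters gain WEBP_RESTRICT, libwebp's portability wrapper for C99 `restrict`. A sketch of the usual shape of such a wrapper (the exact conditionals live in the library's headers and may differ):

```c
#if !defined(WEBP_RESTRICT)
#if defined(__GNUC__) || defined(__clang__)
#define WEBP_RESTRICT __restrict__   /* GCC/Clang spelling */
#elif defined(_MSC_VER)
#define WEBP_RESTRICT __restrict     /* MSVC spelling */
#else
#define WEBP_RESTRICT                /* no-op elsewhere */
#endif
#endif
```

Promising the compiler that `coeffs` and `res` never alias lets it keep loaded coefficients in registers instead of re-reading them after every store through the other pointer.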

View File

@ -96,8 +96,8 @@ static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
return cost;
}
static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
VP8Residual* const res) {
static void SetResidualCoeffs_MIPS32(const int16_t* WEBP_RESTRICT const coeffs,
VP8Residual* WEBP_RESTRICT const res) {
const int16_t* p_coeffs = (int16_t*)coeffs;
int temp0, temp1, temp2, n, n1;
assert(res->first == 0 || coeffs[0] == 0);

View File

@ -19,8 +19,8 @@
static const uint8_t position[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16 };
static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
VP8Residual* const res) {
static void SetResidualCoeffs_NEON(const int16_t* WEBP_RESTRICT const coeffs,
VP8Residual* WEBP_RESTRICT const res) {
const int16x8_t minus_one = vdupq_n_s16(-1);
const int16x8_t coeffs_0 = vld1q_s16(coeffs);
const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8);

View File

@ -22,8 +22,8 @@
//------------------------------------------------------------------------------
static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
VP8Residual* const res) {
static void SetResidualCoeffs_SSE2(const int16_t* WEBP_RESTRICT const coeffs,
VP8Residual* WEBP_RESTRICT const res) {
const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
// Use SSE2 to compare 16 values with a single instruction.

View File

@ -38,7 +38,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
} while (0)
#if !WEBP_NEON_OMIT_C_CODE
static void TransformOne_C(const int16_t* in, uint8_t* dst) {
static void TransformOne_C(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int C[4 * 4], *tmp;
int i;
tmp = C;
@ -82,7 +83,8 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
}
// Simplified transform when only in[0], in[1] and in[4] are non-zero
static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
static void TransformAC3_C(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int a = in[0] + 4;
const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@ -95,7 +97,8 @@ static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
}
#undef STORE2
static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
static void TransformTwo_C(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
TransformOne_C(in, dst);
if (do_two) {
TransformOne_C(in + 16, dst + 4);
@ -103,13 +106,15 @@ static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void TransformUV_C(const int16_t* in, uint8_t* dst) {
static void TransformUV_C(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
VP8Transform(in + 0 * 16, dst, 1);
VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
}
#if !WEBP_NEON_OMIT_C_CODE
static void TransformDC_C(const int16_t* in, uint8_t* dst) {
static void TransformDC_C(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int DC = in[0] + 4;
int i, j;
for (j = 0; j < 4; ++j) {
@ -120,7 +125,8 @@ static void TransformDC_C(const int16_t* in, uint8_t* dst) {
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
static void TransformDCUV_C(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@ -133,7 +139,8 @@ static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
// Paragraph 14.3
#if !WEBP_NEON_OMIT_C_CODE
static void TransformWHT_C(const int16_t* in, int16_t* out) {
static void TransformWHT_C(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out) {
int tmp[16];
int i;
for (i = 0; i < 4; ++i) {
@ -161,7 +168,7 @@ static void TransformWHT_C(const int16_t* in, int16_t* out) {
}
#endif // !WEBP_NEON_OMIT_C_CODE
void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
VP8WHT VP8TransformWHT;
//------------------------------------------------------------------------------
// Intra predictions
@ -661,32 +668,32 @@ static void HFilter16i_C(uint8_t* p, int stride,
#if !WEBP_NEON_OMIT_C_CODE
// 8-pixels wide variant, for chroma filtering
static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
#if !WEBP_NEON_OMIT_C_CODE
static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
@ -694,8 +701,8 @@ static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
//------------------------------------------------------------------------------
static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
int dst_stride) {
static void DitherCombine8x8_C(const uint8_t* WEBP_RESTRICT dither,
uint8_t* WEBP_RESTRICT dst, int dst_stride) {
int i, j;
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i) {
@ -730,8 +737,8 @@ VP8SimpleFilterFunc VP8SimpleHFilter16;
VP8SimpleFilterFunc VP8SimpleVFilter16i;
VP8SimpleFilterFunc VP8SimpleHFilter16i;
void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
int dst_stride);
void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
uint8_t* WEBP_RESTRICT dst, int dst_stride);
extern VP8CPUInfo VP8GetCPUInfo;
extern void VP8DspInitSSE2(void);

View File

@ -133,26 +133,26 @@ static void HFilter16(uint8_t* p, int stride,
}
// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
@ -215,7 +215,8 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
}
}
static void TransformOne(const int16_t* in, uint8_t* dst) {
static void TransformOne(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14;
@ -532,7 +533,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
);
}
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
static void TransformTwo(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
TransformOne(in, dst);
if (do_two) {
TransformOne(in + 16, dst + 4);

View File

@ -21,7 +21,8 @@
static const int kC1 = WEBP_TRANSFORM_AC3_C1;
static const int kC2 = WEBP_TRANSFORM_AC3_C2;
static void TransformDC(const int16_t* in, uint8_t* dst) {
static void TransformDC(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
__asm__ volatile (
@ -45,7 +46,8 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
);
}
static void TransformAC3(const int16_t* in, uint8_t* dst) {
static void TransformAC3(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int a = in[0] + 4;
int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@ -81,7 +83,8 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
);
}
static void TransformOne(const int16_t* in, uint8_t* dst) {
static void TransformOne(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
@ -148,7 +151,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
);
}
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
static void TransformTwo(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
TransformOne(in, dst);
if (do_two) {
TransformOne(in + 16, dst + 4);
@ -434,14 +438,14 @@ static void HFilter16(uint8_t* p, int stride,
}
// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
@ -465,14 +469,14 @@ static void HFilter16i(uint8_t* p, int stride,
}
}
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}

View File

@ -38,7 +38,8 @@
BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
}
static void TransformOne(const int16_t* in, uint8_t* dst) {
static void TransformOne(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
v8i16 input0, input1;
v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
v4i32 res0, res1, res2, res3;
@ -65,14 +66,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
}
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
static void TransformTwo(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
TransformOne(in, dst);
if (do_two) {
TransformOne(in + 16, dst + 4);
}
}
static void TransformWHT(const int16_t* in, int16_t* out) {
static void TransformWHT(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out) {
v8i16 input0, input1;
const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
@ -114,13 +117,15 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
out[240] = __msa_copy_s_h(out1, 7);
}
static void TransformDC(const int16_t* in, uint8_t* dst) {
static void TransformDC(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int DC = (in[0] + 4) >> 3;
const v8i16 tmp0 = __msa_fill_h(DC);
ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS);
}
static void TransformAC3(const int16_t* in, uint8_t* dst) {
static void TransformAC3(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int a = in[0] + 4;
const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@ -475,8 +480,8 @@ static void HFilter16i(uint8_t* src_y, int stride,
}
// 8-pixels wide variants, for chroma filtering
static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
int b_limit_in, int limit_in, int thresh_in) {
static void VFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
int stride, int b_limit_in, int limit_in, int thresh_in) {
uint8_t* ptmp_src_u = src_u - 4 * stride;
uint8_t* ptmp_src_v = src_v - 4 * stride;
uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
@ -520,8 +525,8 @@ static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
SD(q2_d, ptmp_src_v);
}
static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
int b_limit_in, int limit_in, int thresh_in) {
static void HFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
int stride, int b_limit_in, int limit_in, int thresh_in) {
uint8_t* ptmp_src_u = src_u - 4;
uint8_t* ptmp_src_v = src_v - 4;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
@ -556,7 +561,8 @@ static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);
}
static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
static void VFilter8i(uint8_t* WEBP_RESTRICT src_u,
uint8_t* WEBP_RESTRICT src_v, int stride,
int b_limit_in, int limit_in, int thresh_in) {
uint64_t p1_d, p0_d, q0_d, q1_d;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
@ -587,7 +593,8 @@ static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);
}
static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
static void HFilter8i(uint8_t* WEBP_RESTRICT src_u,
uint8_t* WEBP_RESTRICT src_v, int stride,
int b_limit_in, int limit_in, int thresh_in) {
v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;

View File

@ -916,8 +916,8 @@ static void HFilter16i_NEON(uint8_t* p, int stride,
#endif // !WORK_AROUND_GCC
// 8-pixels wide variant, for chroma filtering
static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
@ -932,7 +932,8 @@ static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
}
}
static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride,
int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
u += 4 * stride;
@ -949,8 +950,8 @@ static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
}
#if !defined(WORK_AROUND_GCC)
static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
@ -964,7 +965,8 @@ static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
}
}
static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride,
int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
u += 4;
@ -1041,7 +1043,8 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
Transpose8x2_NEON(E0, E1, rows);
}
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int16x8x2_t rows;
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
TransformPass_NEON(&rows);
@ -1051,7 +1054,8 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
#else
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int kBPS = BPS;
// kC1, kC2. Padded because vld1.16 loads 8 bytes
const int16_t constants[4] = { kC1, kC2, 0, 0 };
@ -1184,14 +1188,16 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
#endif // WEBP_USE_INTRINSICS
static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
TransformOne_NEON(in, dst);
if (do_two) {
TransformOne_NEON(in + 16, dst + 4);
}
}
static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int16x8_t DC = vdupq_n_s16(in[0]);
Add4x4_NEON(DC, DC, dst);
}
@ -1205,7 +1211,8 @@ static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
*dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
} while (0)
static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out) {
int32x4x4_t tmp;
{
@ -1256,7 +1263,8 @@ static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
//------------------------------------------------------------------------------
static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int16x4_t A = vld1_dup_s16(in);
const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
@ -1300,18 +1308,19 @@ static void DC4_NEON(uint8_t* dst) { // DC
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]'
const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1]
const uint16x8_t d = vsubl_u8(T, TL); // A[c] - A[-1]
int y;
for (y = 0; y < size; y += 4) {
// left edge
const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1]
const int16x8_t r1 = vaddq_s16(L1, d);
const int16x8_t r2 = vaddq_s16(L2, d);
const int16x8_t r3 = vaddq_s16(L3, d);
const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
// L[r] + A[c] - A[-1]
const int16x8_t r0 = vreinterpretq_s16_u16(vaddw_u8(d, L0));
const int16x8_t r1 = vreinterpretq_s16_u16(vaddw_u8(d, L1));
const int16x8_t r2 = vreinterpretq_s16_u16(vaddw_u8(d, L2));
const int16x8_t r3 = vreinterpretq_s16_u16(vaddw_u8(d, L3));
// Saturate and store the result.
const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
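
Note on why the rewrite above is safe (my reasoning, not an upstream comment): `d = T - TL` is kept as an unsigned 16-bit value in two's complement, `vaddw_u8` widens each left pixel to u16 and adds modulo 2^16, and the final reinterpret to s16 recovers `L + T - TL` exactly, because the true result always lies in [-255, 510], well inside the s16 range. A scalar model:

```c
#include <stdint.h>
/* Scalar model of the u16-wraparound trick behind vsubl_u8/vaddw_u8: */
int16_t PredictTM(uint8_t L, uint8_t T, uint8_t TL) {
  const uint16_t d = (uint16_t)((int)T - (int)TL); /* may wrap if T < TL */
  return (int16_t)(uint16_t)(d + L);               /* == L + T - TL */
}
```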
@ -1572,23 +1581,24 @@ static void TM16_NEON(uint8_t* dst) {
const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]'
// A[c] - A[-1]
const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
const uint16x8_t d_lo = vsubl_u8(vget_low_u8(T), TL);
const uint16x8_t d_hi = vsubl_u8(vget_high_u8(T), TL);
int y;
for (y = 0; y < 16; y += 4) {
// left edge
const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1]
const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
// L[r] + A[c] - A[-1]
const int16x8_t r0_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L0));
const int16x8_t r1_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L1));
const int16x8_t r2_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L2));
const int16x8_t r3_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L3));
const int16x8_t r0_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L0));
const int16x8_t r1_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L1));
const int16x8_t r2_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L2));
const int16x8_t r3_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L3));
// Saturate and store the result.
const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));

View File

@ -30,7 +30,8 @@
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
static void Transform_SSE2(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
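
Note, for reference (my arithmetic, consistent with the kC1/kC2 values visible in the MIPS hunk earlier): K1 = sqrt(2)·cos(pi/8) ≈ 1.30656, so K1·2^16 ≈ 85627; K2 = sqrt(2)·sin(pi/8) ≈ 0.54120, so K2·2^16 ≈ 35468. Since 85627 does not fit a signed 16-bit multiplier, the code stores 85627 − 65536 = 20091 and adds the operand back after the multiply — exactly what the WEBP_TRANSFORM_AC3_MUL1 macro in the earlier hunks does:

```c
/* Equivalent forms of the two fixed-point multiplies: */
#define MUL_K1(a) ((((a) * 20091) >> 16) + (a))  /* == ((a) * 85627) >> 16 */
#define MUL_K2(a) (((a) * 35468) >> 16)          /* K2 fits 16 bits as-is */
```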
@ -197,7 +198,8 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
#if (USE_TRANSFORM_AC3 == 1)
static void TransformAC3(const int16_t* in, uint8_t* dst) {
static void TransformAC3_SSE2(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const __m128i A = _mm_set1_epi16(in[0] + 4);
const __m128i c4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
const __m128i d4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
@ -792,8 +794,8 @@ static void HFilter16i_SSE2(uint8_t* p, int stride,
}
// 8-pixels wide variant, for chroma filtering
static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, p2, p1, p0, q0, q1, q2;
@ -817,8 +819,8 @@ static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
STOREUV(q2, u, v, 2 * stride);
}
static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
@ -837,7 +839,8 @@ static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
}
static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
static void VFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
@ -863,7 +866,8 @@ static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
STOREUV(q1, u, v, 1 * stride);
}
static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
static void HFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;

View File

@ -60,53 +60,66 @@ extern "C" {
// Transforms
// VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
// will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two);
typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
typedef void (*VP8Idct)(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two);
typedef void (*VP8Fdct)(const uint8_t* WEBP_RESTRICT src,
const uint8_t* WEBP_RESTRICT ref,
int16_t* WEBP_RESTRICT out);
typedef void (*VP8WHT)(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out);
extern VP8Idct VP8ITransform;
extern VP8Fdct VP8FTransform;
extern VP8Fdct VP8FTransform2; // performs two transforms at a time
extern VP8WHT VP8FTransformWHT;
// Predictions
// *dst is the destination block. *top and *left can be NULL.
typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left,
const uint8_t* top);
typedef void (*VP8Intra4Preds)(uint8_t* dst, const uint8_t* top);
typedef void (*VP8IntraPreds)(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top);
typedef void (*VP8Intra4Preds)(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top);
extern VP8Intra4Preds VP8EncPredLuma4;
extern VP8IntraPreds VP8EncPredLuma16;
extern VP8IntraPreds VP8EncPredChroma8;
typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
typedef int (*VP8Metric)(const uint8_t* WEBP_RESTRICT pix,
const uint8_t* WEBP_RESTRICT ref);
extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
const uint16_t* const weights);
typedef int (*VP8WMetric)(const uint8_t* WEBP_RESTRICT pix,
const uint8_t* WEBP_RESTRICT ref,
const uint16_t* WEBP_RESTRICT const weights);
// The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major
// 4 by 4 symmetric matrix.
extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
// Compute the average (DC) of four 4x4 blocks.
// Each sub-4x4 block #i sum is stored in dc[i].
typedef void (*VP8MeanMetric)(const uint8_t* ref, uint32_t dc[4]);
typedef void (*VP8MeanMetric)(const uint8_t* WEBP_RESTRICT ref,
uint32_t dc[4]);
extern VP8MeanMetric VP8Mean16x4;
typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
typedef void (*VP8BlockCopy)(const uint8_t* WEBP_RESTRICT src,
uint8_t* WEBP_RESTRICT dst);
extern VP8BlockCopy VP8Copy4x4;
extern VP8BlockCopy VP8Copy16x8;
// Quantization
struct VP8Matrix; // forward declaration
typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
const struct VP8Matrix* const mtx);
typedef int (*VP8QuantizeBlock)(
int16_t in[16], int16_t out[16],
const struct VP8Matrix* WEBP_RESTRICT const mtx);
// Same as VP8QuantizeBlock, but quantizes two consecutive blocks.
typedef int (*VP8Quantize2Blocks)(int16_t in[32], int16_t out[32],
const struct VP8Matrix* const mtx);
typedef int (*VP8Quantize2Blocks)(
int16_t in[32], int16_t out[32],
const struct VP8Matrix* WEBP_RESTRICT const mtx);
extern VP8QuantizeBlock VP8EncQuantizeBlock;
extern VP8Quantize2Blocks VP8EncQuantize2Blocks;
// specific to 2nd transform:
typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
const struct VP8Matrix* const mtx);
typedef int (*VP8QuantizeBlockWHT)(
int16_t in[16], int16_t out[16],
const struct VP8Matrix* WEBP_RESTRICT const mtx);
extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
extern const int VP8DspScan[16 + 4 + 4];
@ -118,9 +131,10 @@ typedef struct {
int max_value;
int last_non_zero;
} VP8Histogram;
typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
typedef void (*VP8CHisto)(const uint8_t* WEBP_RESTRICT ref,
const uint8_t* WEBP_RESTRICT pred,
int start_block, int end_block,
VP8Histogram* const histo);
VP8Histogram* WEBP_RESTRICT const histo);
extern VP8CHisto VP8CollectHistogram;
// General-purpose util function to help VP8CollectHistogram().
void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
@ -138,8 +152,9 @@ extern const uint16_t VP8LevelFixedCosts[2047 /*MAX_LEVEL*/ + 1];
extern const uint8_t VP8EncBands[16 + 1];
struct VP8Residual;
typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
struct VP8Residual* const res);
typedef void (*VP8SetResidualCoeffsFunc)(
const int16_t* WEBP_RESTRICT const coeffs,
struct VP8Residual* WEBP_RESTRICT const res);
extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
// Cost calculation function.
@ -193,9 +208,11 @@ void VP8SSIMDspInit(void);
//------------------------------------------------------------------------------
// Decoding
typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
typedef void (*VP8DecIdct)(const int16_t* WEBP_RESTRICT coeffs,
uint8_t* WEBP_RESTRICT dst);
// when doing two transforms, coeffs is actually int16_t[2][16].
typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
typedef void (*VP8DecIdct2)(const int16_t* WEBP_RESTRICT coeffs,
uint8_t* WEBP_RESTRICT dst, int do_two);
extern VP8DecIdct2 VP8Transform;
extern VP8DecIdct VP8TransformAC3;
extern VP8DecIdct VP8TransformUV;
@ -233,7 +250,8 @@ extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
// regular filter (on both macroblock edges and inner edges)
typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
int thresh, int ithresh, int hev_t);
typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
typedef void (*VP8ChromaFilterFunc)(uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int stride,
int thresh, int ithresh, int hev_t);
// on outer edge
extern VP8LumaFilterFunc VP8VFilter16;
@ -253,8 +271,8 @@ extern VP8ChromaFilterFunc VP8HFilter8i;
#define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1))
#define VP8_DITHER_AMP_BITS 7
#define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS)
extern void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
int dst_stride);
extern void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
uint8_t* WEBP_RESTRICT dst, int dst_stride);
// must be called before anything using the above
void VP8DspInit(void);
@ -267,10 +285,10 @@ void VP8DspInit(void);
// Convert a pair of y/u/v lines together to the output rgb/a colorspace.
// bottom_y can be NULL if only one line of output is needed (at top/bottom).
typedef void (*WebPUpsampleLinePairFunc)(
const uint8_t* top_y, const uint8_t* bottom_y,
const uint8_t* top_u, const uint8_t* top_v,
const uint8_t* cur_u, const uint8_t* cur_v,
uint8_t* top_dst, uint8_t* bottom_dst, int len);
const uint8_t* WEBP_RESTRICT top_y, const uint8_t* WEBP_RESTRICT bottom_y,
const uint8_t* WEBP_RESTRICT top_u, const uint8_t* WEBP_RESTRICT top_v,
const uint8_t* WEBP_RESTRICT cur_u, const uint8_t* WEBP_RESTRICT cur_v,
uint8_t* WEBP_RESTRICT top_dst, uint8_t* WEBP_RESTRICT bottom_dst, int len);
#ifdef FANCY_UPSAMPLING
@ -280,13 +298,15 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
#endif // FANCY_UPSAMPLING
// Per-row point-sampling methods.
typedef void (*WebPSamplerRowFunc)(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len);
typedef void (*WebPSamplerRowFunc)(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len);
// Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
const uint8_t* u, const uint8_t* v, int uv_stride,
uint8_t* dst, int dst_stride,
void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v, int uv_stride,
uint8_t* WEBP_RESTRICT dst, int dst_stride,
int width, int height, WebPSamplerRowFunc func);
// Sampling functions to convert rows of YUV to RGB(A)
@ -298,9 +318,10 @@ extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);
// YUV444->RGB converters
typedef void (*WebPYUV444Converter)(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len);
typedef void (*WebPYUV444Converter)(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len);
extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
@ -316,26 +337,35 @@ void WebPInitYUV444Converters(void);
// ARGB -> YUV converters
// Convert ARGB samples to luma Y.
extern void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
extern void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT y, int width);
// Convert ARGB samples to U/V with downsampling. do_store should be '1' for
// even lines and '0' for odd ones. 'src_width' is the original width, not
// the U/V one.
extern void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
extern void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v,
int src_width, int do_store);
// Convert a row of accumulated (four-values) of rgba32 toward U/V
extern void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width);
extern void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width);
// Convert RGB or BGR to Y
extern void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
extern void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
extern void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width);
extern void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width);
// used for plain-C fallback.
extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
extern void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v,
int src_width, int do_store);
extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width);
extern void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width);
// Must be called before using the above.
void WebPInitConvertARGBToYUV(void);
@ -348,8 +378,9 @@ struct WebPRescaler;
// Import a row of data and save its contribution in the rescaler.
// 'channel' denotes the channel number to be imported. 'Expand' corresponds to
// the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
typedef void (*WebPRescalerImportRowFunc)(struct WebPRescaler* const wrk,
const uint8_t* src);
typedef void (*WebPRescalerImportRowFunc)(
struct WebPRescaler* WEBP_RESTRICT const wrk,
const uint8_t* WEBP_RESTRICT src);
extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
@ -362,16 +393,19 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
// Plain-C implementation, as fall-back.
extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk,
const uint8_t* src);
extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk,
const uint8_t* src);
extern void WebPRescalerImportRowExpand_C(
struct WebPRescaler* WEBP_RESTRICT const wrk,
const uint8_t* WEBP_RESTRICT src);
extern void WebPRescalerImportRowShrink_C(
struct WebPRescaler* WEBP_RESTRICT const wrk,
const uint8_t* WEBP_RESTRICT src);
extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);
// Main entry calls:
extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
const uint8_t* src);
extern void WebPRescalerImportRow(
struct WebPRescaler* WEBP_RESTRICT const wrk,
const uint8_t* WEBP_RESTRICT src);
// Export one row (starting at x_out position) from rescaler.
extern void WebPRescalerExportRow(struct WebPRescaler* const wrk);
@ -480,8 +514,9 @@ typedef enum { // Filter types.
WEBP_FILTER_FAST
} WEBP_FILTER_TYPE;
typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
int stride, uint8_t* out);
typedef void (*WebPFilterFunc)(const uint8_t* WEBP_RESTRICT in,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT out);
// In-place un-filtering.
// Warning! 'prev_line' pointer can be equal to 'cur_line' or 'preds'.
typedef void (*WebPUnfilterFunc)(const uint8_t* prev_line, const uint8_t* preds,

View File

@ -59,9 +59,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
}
#if !WEBP_NEON_OMIT_C_CODE
static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
static void CollectHistogram_C(const uint8_t* WEBP_RESTRICT ref,
const uint8_t* WEBP_RESTRICT pred,
int start_block, int end_block,
VP8Histogram* const histo) {
VP8Histogram* WEBP_RESTRICT const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
@ -109,8 +110,9 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
#define STORE(x, y, v) \
dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int C[4 * 4], *tmp;
int i;
tmp = C;
@ -146,7 +148,9 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
}
}
static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
static void ITransform_C(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst,
int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
@ -154,7 +158,9 @@ static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
}
}
static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform_C(const uint8_t* WEBP_RESTRICT src,
const uint8_t* WEBP_RESTRICT ref,
int16_t* WEBP_RESTRICT out) {
int i;
int tmp[16];
for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
@ -184,14 +190,16 @@ static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
static void FTransform2_C(const uint8_t* WEBP_RESTRICT src,
const uint8_t* WEBP_RESTRICT ref,
int16_t* WEBP_RESTRICT out) {
VP8FTransform(src, ref, out);
VP8FTransform(src + 4, ref + 4, out + 16);
}
#if !WEBP_NEON_OMIT_C_CODE
static void FTransformWHT_C(const int16_t* in, int16_t* out) {
static void FTransformWHT_C(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out) {
// input is 12b signed
int32_t tmp[16];
int i;
@ -234,8 +242,9 @@ static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
}
}
static WEBP_INLINE void VerticalPred(uint8_t* dst,
const uint8_t* top, int size) {
static WEBP_INLINE void VerticalPred(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top,
int size) {
int j;
if (top != NULL) {
for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
@ -244,8 +253,9 @@ static WEBP_INLINE void VerticalPred(uint8_t* dst,
}
}
static WEBP_INLINE void HorizontalPred(uint8_t* dst,
const uint8_t* left, int size) {
static WEBP_INLINE void HorizontalPred(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
int size) {
if (left != NULL) {
int j;
for (j = 0; j < size; ++j) {
@ -256,8 +266,9 @@ static WEBP_INLINE void HorizontalPred(uint8_t* dst,
}
}
static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
static WEBP_INLINE void TrueMotion(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top, int size) {
int y;
if (left != NULL) {
if (top != NULL) {
@ -286,8 +297,9 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
}
}
static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
const uint8_t* top,
static WEBP_INLINE void DCMode(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top,
int size, int round, int shift) {
int DC = 0;
int j;
@ -312,8 +324,9 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static void IntraChromaPreds_C(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
// U block
DCMode(C8DC8 + dst, left, top, 8, 8, 4);
VerticalPred(C8VE8 + dst, top, 8);
@ -332,22 +345,28 @@ static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
static void Intra16Preds_C(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
static void Intra16Preds_C(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
DCMode(I16DC16 + dst, left, top, 16, 16, 5);
VerticalPred(I16VE16 + dst, top, 16);
HorizontalPred(I16HE16 + dst, left, 16);
TrueMotion(I16TM16 + dst, left, top, 16);
}
#endif // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
//------------------------------------------------------------------------------
// luma 4x4 prediction
#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static void VE4(uint8_t* dst, const uint8_t* top) { // vertical
// vertical
static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
const uint8_t vals[4] = {
AVG3(top[-1], top[0], top[1]),
AVG3(top[ 0], top[1], top[2]),
@ -360,7 +379,8 @@ static void VE4(uint8_t* dst, const uint8_t* top) { // vertical
}
}
static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
// horizontal
static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@ -372,14 +392,14 @@ static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}
static void DC4(uint8_t* dst, const uint8_t* top) {
static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
Fill(dst, dc >> 3, 4);
}
static void RD4(uint8_t* dst, const uint8_t* top) {
static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@ -398,7 +418,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) {
DST(3, 0) = AVG3(D, C, B);
}
static void LD4(uint8_t* dst, const uint8_t* top) {
static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
const int A = top[0];
const int B = top[1];
const int C = top[2];
@ -416,7 +436,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) {
DST(3, 3) = AVG3(G, H, H);
}
static void VR4(uint8_t* dst, const uint8_t* top) {
static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@ -438,7 +458,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) {
DST(3, 1) = AVG3(B, C, D);
}
static void VL4(uint8_t* dst, const uint8_t* top) {
static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
const int A = top[0];
const int B = top[1];
const int C = top[2];
@ -460,7 +480,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) {
DST(3, 3) = AVG3(F, G, H);
}
static void HU4(uint8_t* dst, const uint8_t* top) {
static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
@ -475,7 +495,7 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static void HD4(uint8_t* dst, const uint8_t* top) {
static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@ -498,7 +518,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) {
DST(1, 3) = AVG3(L, K, J);
}
static void TM4(uint8_t* dst, const uint8_t* top) {
static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
int x, y;
const uint8_t* const clip = clip1 + 255 - top[-1];
for (y = 0; y < 4; ++y) {
@ -516,7 +536,8 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
// Left samples are top[-5 .. -2], top_left is top[-1], the top samples are
// located at top[0..3], and the top-right samples at top[4..7]
static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
static void Intra4Preds_C(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
@ -529,11 +550,14 @@ static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
HU4(I4HU4 + dst, top);
}
#endif // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
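
To make the packed sample layout described above Intra4Preds_C concrete, here is a small sketch of how a caller could assemble the context buffer that the 4x4 predictors index through 'top'. The helper name and buffer handling are illustrative, not libwebp API:

#include <stdint.h>
#include <string.h>

// Illustrative sketch only: pack the 4x4 intra context so that the left
// samples (I..L, top to bottom) land at top[-2..-5], the top-left pixel X
// at top[-1], the top row A..D at top[0..3] and the top-right E..H at
// top[4..7], matching the comment above.
static const uint8_t* PackI4Context(uint8_t buf[13],
                                    const uint8_t left[4],       // I, J, K, L
                                    uint8_t top_left,            // X
                                    const uint8_t top_row[8]) {  // A..H
  buf[0] = left[3];   // L -> top[-5]
  buf[1] = left[2];   // K -> top[-4]
  buf[2] = left[1];   // J -> top[-3]
  buf[3] = left[0];   // I -> top[-2]
  buf[4] = top_left;  // X -> top[-1]
  memcpy(buf + 5, top_row, 8);  // A..H -> top[0..7]
  return buf + 5;     // pass this pointer as 'top'
}
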
//------------------------------------------------------------------------------
// Metric
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
static WEBP_INLINE int GetSSE(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b,
int w, int h) {
int count = 0;
int y, x;
@ -548,21 +572,25 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
return count;
}
static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_C(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
return GetSSE(a, b, 16, 16);
}
static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_C(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
return GetSSE(a, b, 16, 8);
}
static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_C(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
return GetSSE(a, b, 8, 8);
}
static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_C(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
return GetSSE(a, b, 4, 4);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
static void Mean16x4_C(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) {
int k, x, y;
for (k = 0; k < 4; ++k) {
uint32_t avg = 0;
@ -586,7 +614,8 @@ static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform(const uint8_t* in, const uint16_t* w) {
static int TTransform(const uint8_t* WEBP_RESTRICT in,
const uint16_t* WEBP_RESTRICT w) {
int sum = 0;
int tmp[16];
int i;
@ -620,15 +649,17 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
return sum;
}
static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_C(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
const int sum1 = TTransform(a, w);
const int sum2 = TTransform(b, w);
return abs(sum2 - sum1) >> 5;
}
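
Because the hunk elides most of TTransform's body, a plain-C sketch of the weighted 4x4 Hadamard sum it computes may help. This is a reconstruction for illustration, assuming 'in' points at a 4x4 block with row stride BPS, not a verbatim copy of the upstream code:

#include <stdint.h>
#include <stdlib.h>

// Sketch: horizontal then vertical 4-point Hadamard butterflies, followed
// by a weighted sum of the absolute transformed coefficients (w[] is the
// row-major 4x4 symmetric weight matrix mentioned above).
static int TTransformSketch(const uint8_t* in, int stride,
                            const uint16_t* w) {
  int tmp[16];
  int sum = 0;
  int i;
  for (i = 0; i < 4; ++i, in += stride) {  // horizontal pass
    const int a0 = in[0] + in[2], a1 = in[1] + in[3];
    const int a2 = in[1] - in[3], a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i, ++w) {           // vertical pass + weighting
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    sum += w[0] * abs(a0 + a1) + w[4] * abs(a3 + a2) +
           w[8] * abs(a3 - a2) + w[12] * abs(a0 - a1);
  }
  return sum;
}

Disto4x4 then reports abs(sum2 - sum1) >> 5, i.e. the weighted spectral difference rescaled back to roughly pixel units.
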
static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_C(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -644,13 +675,14 @@ static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
// Quantization
//
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static const uint8_t kZigzag[16] = {
0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
// Simple quantization
static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
int last = -1;
int n;
for (n = 0; n < 16; ++n) {
@ -675,9 +707,8 @@ static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
return (last >= 0);
}
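
The body of QuantizeBlock_C is largely elided by the hunk; for orientation, here is a deliberately simplified sketch of zigzag-ordered quantization. It uses a plain rounding division instead of libwebp's fixed-point QUANTDIV with bias and sharpen terms, and omits the MAX_LEVEL clamp, so treat it as a conceptual outline only:

#include <stdint.h>

static const uint8_t kZigzagSketch[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

// Simplified sketch: quantize in zigzag order, write the dequantized value
// back into in[] for reconstruction, and report whether any coefficient
// survived (the return convention QuantizeBlock_C also uses).
static int QuantizeSketch(int16_t in[16], int16_t out[16],
                          const uint16_t q[16]) {
  int last = -1;
  int n;
  for (n = 0; n < 16; ++n) {
    const int j = kZigzagSketch[n];
    const int sign = (in[j] < 0);
    const int coeff = sign ? -in[j] : in[j];
    int level = (coeff + (q[j] >> 1)) / q[j];  // plain rounding division
    if (sign) level = -level;
    in[j] = (int16_t)(level * q[j]);  // dequantized, for the predictor loop
    out[n] = (int16_t)level;          // zigzag order, for the coder
    if (level) last = n;
  }
  return (last >= 0);
}
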
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
int nz;
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -688,7 +719,8 @@ static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
//------------------------------------------------------------------------------
// Block copy
static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
static WEBP_INLINE void Copy(const uint8_t* WEBP_RESTRICT src,
uint8_t* WEBP_RESTRICT dst, int w, int h) {
int y;
for (y = 0; y < h; ++y) {
memcpy(dst, src, w);
@ -697,11 +729,13 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
}
}
static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
static void Copy4x4_C(const uint8_t* WEBP_RESTRICT src,
uint8_t* WEBP_RESTRICT dst) {
Copy(src, dst, 4, 4);
}
static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
static void Copy16x8_C(const uint8_t* WEBP_RESTRICT src,
uint8_t* WEBP_RESTRICT dst) {
Copy(src, dst, 16, 8);
}
@ -760,14 +794,19 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
VP8EncQuantizeBlock = QuantizeBlock_C;
VP8EncQuantize2Blocks = Quantize2Blocks_C;
VP8EncQuantizeBlockWHT = QuantizeBlock_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
VP8EncPredLuma4 = Intra4Preds_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
VP8EncPredLuma16 = Intra16Preds_C;
#endif
VP8FTransform2 = FTransform2_C;
VP8EncPredLuma4 = Intra4Preds_C;
VP8EncPredLuma16 = Intra16Preds_C;
VP8EncPredChroma8 = IntraChromaPreds_C;
VP8Mean16x4 = Mean16x4_C;
VP8EncQuantizeBlockWHT = QuantizeBlock_C;
VP8Copy4x4 = Copy4x4_C;
VP8Copy16x8 = Copy16x8_C;

View File

@ -109,9 +109,9 @@ static const int kC2 = WEBP_TRANSFORM_AC3_C2;
"sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
// Does one or two inverse transforms.
static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
const int16_t* in,
uint8_t* dst) {
static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
@ -141,8 +141,9 @@ static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
);
}
static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
uint8_t* dst, int do_two) {
static void ITransform_MIPS32(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
ITransformOne_MIPS32(ref, in, dst);
if (do_two) {
ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
@ -236,7 +237,7 @@ static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
}
static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
int nz;
nz = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -358,8 +359,9 @@ static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
"msub %[temp6], %[temp0] \n\t" \
"msub %[temp7], %[temp1] \n\t"
static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_MIPS32(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
int tmp[32];
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -393,8 +395,9 @@ static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_MIPS32(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -475,8 +478,9 @@ static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
static void FTransform_MIPS32(const uint8_t* WEBP_RESTRICT src,
const uint8_t* WEBP_RESTRICT ref,
int16_t* WEBP_RESTRICT out) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
int temp17, temp18, temp19, temp20;
@ -537,7 +541,8 @@ static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
GET_SSE_INNER(D, D + 1, D + 2, D + 3)
static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_MIPS32(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -571,7 +576,8 @@ static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_MIPS32(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -597,7 +603,8 @@ static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_MIPS32(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -619,7 +626,8 @@ static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_MIPS32(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

View File

@ -141,8 +141,9 @@ static const int kC2 = WEBP_TRANSFORM_AC3_C2;
"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
static void FTransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT src,
const uint8_t* WEBP_RESTRICT ref,
int16_t* WEBP_RESTRICT out) {
const int c2217 = 2217;
const int c5352 = 5352;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -171,8 +172,9 @@ static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
@ -239,16 +241,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
);
}
static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
uint8_t* dst, int do_two) {
static void ITransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
@ -314,9 +318,9 @@ static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
return abs(temp3 - temp17) >> 5;
}
static int Disto16x16_MIPSdspR2(const uint8_t* const a,
const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -367,8 +371,8 @@ static int Disto16x16_MIPSdspR2(const uint8_t* const a,
} while (0)
#define VERTICAL_PRED(DST, TOP, SIZE) \
static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST), \
const uint8_t* (TOP)) { \
static WEBP_INLINE void VerticalPred##SIZE( \
uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (TOP)) { \
int j; \
if ((TOP)) { \
for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE)); \
@ -383,8 +387,8 @@ VERTICAL_PRED(dst, top, 16)
#undef VERTICAL_PRED
#define HORIZONTAL_PRED(DST, LEFT, SIZE) \
static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST), \
const uint8_t* (LEFT)) { \
static WEBP_INLINE void HorizontalPred##SIZE( \
uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (LEFT)) { \
if (LEFT) { \
int j; \
for (j = 0; j < (SIZE); ++j) { \
@ -451,8 +455,9 @@ HORIZONTAL_PRED(dst, left, 16)
} while (0)
#define TRUE_MOTION(DST, LEFT, TOP, SIZE) \
static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\
const uint8_t* (TOP)) { \
static WEBP_INLINE void TrueMotion##SIZE(uint8_t* WEBP_RESTRICT (DST), \
const uint8_t* WEBP_RESTRICT (LEFT), \
const uint8_t* WEBP_RESTRICT (TOP)) { \
if ((LEFT) != NULL) { \
if ((TOP) != NULL) { \
CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE)); \
@ -480,8 +485,9 @@ TRUE_MOTION(dst, left, top, 16)
#undef CLIP_8B_TO_DST
#undef CLIPPING
static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DCMode16(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
int DC, DC1;
int temp0, temp1, temp2, temp3;
@ -543,8 +549,9 @@ static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
FILL_8_OR_16(dst, DC, 16);
}
static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DCMode8(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
int DC, DC1;
int temp0, temp1, temp2, temp3;
@ -588,7 +595,7 @@ static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
FILL_8_OR_16(dst, DC, 8);
}
static void DC4(uint8_t* dst, const uint8_t* top) {
static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
int temp0, temp1;
__asm__ volatile(
"ulw %[temp0], 0(%[top]) \n\t"
@ -609,7 +616,7 @@ static void DC4(uint8_t* dst, const uint8_t* top) {
);
}
static void TM4(uint8_t* dst, const uint8_t* top) {
static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
const int c35 = 0xff00ff;
__asm__ volatile (
@ -664,7 +671,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
);
}
static void VE4(uint8_t* dst, const uint8_t* top) {
static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
__asm__ volatile(
"ulw %[temp0], -1(%[top]) \n\t"
@ -695,7 +702,7 @@ static void VE4(uint8_t* dst, const uint8_t* top) {
);
}
static void HE4(uint8_t* dst, const uint8_t* top) {
static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
__asm__ volatile(
"ulw %[temp0], -4(%[top]) \n\t"
@ -731,7 +738,7 @@ static void HE4(uint8_t* dst, const uint8_t* top) {
);
}
static void RD4(uint8_t* dst, const uint8_t* top) {
static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
int temp0, temp1, temp2, temp3, temp4, temp5;
int temp6, temp7, temp8, temp9, temp10, temp11;
__asm__ volatile(
@ -780,7 +787,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) {
);
}
static void VR4(uint8_t* dst, const uint8_t* top) {
static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
__asm__ volatile (
@ -830,7 +837,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) {
);
}
static void LD4(uint8_t* dst, const uint8_t* top) {
static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
int temp0, temp1, temp2, temp3, temp4, temp5;
int temp6, temp7, temp8, temp9, temp10, temp11;
__asm__ volatile(
@ -877,7 +884,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) {
);
}
static void VL4(uint8_t* dst, const uint8_t* top) {
static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
__asm__ volatile (
@ -926,7 +933,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) {
);
}
static void HD4(uint8_t* dst, const uint8_t* top) {
static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
__asm__ volatile (
@ -974,7 +981,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) {
);
}
static void HU4(uint8_t* dst, const uint8_t* top) {
static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__asm__ volatile (
"ulw %[temp0], -5(%[top]) \n\t"
@ -1013,8 +1020,9 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static void IntraChromaPreds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
// U block
DCMode8(C8DC8 + dst, left, top);
VerticalPred8(C8VE8 + dst, top);
@ -1033,8 +1041,9 @@ static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
static void Intra16Preds_MIPSdspR2(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
static void Intra16Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
DCMode16(I16DC16 + dst, left, top);
VerticalPred16(I16VE16 + dst, top);
HorizontalPred16(I16HE16 + dst, left);
@ -1043,7 +1052,8 @@ static void Intra16Preds_MIPSdspR2(uint8_t* dst,
// Left samples are top[-5 .. -2], top_left is top[-1], the top samples are
// located at top[0..3], and the top-right samples at top[4..7]
static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
static void Intra4Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
@ -1079,7 +1089,8 @@ static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
GET_SSE_INNER(C) \
GET_SSE_INNER(D)
static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@ -1109,7 +1120,8 @@ static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@ -1131,7 +1143,8 @@ static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@ -1149,7 +1162,8 @@ static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@ -1273,7 +1287,7 @@ static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
"3: \n\t"
static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
int sign, coeff, level;
int max_level = MAX_LEVEL;
@ -1314,7 +1328,7 @@ static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
}
static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
int nz;
nz = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -1360,7 +1374,8 @@ static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
"usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \
"usw %[" #TEMP6 "], " #D "(%[out]) \n\t"
static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
static void FTransformWHT_MIPSdspR2(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out) {
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;

View File

@ -41,8 +41,9 @@
BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
} while (0)
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
v8i16 input0, input1;
v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
v4i32 res0, res1, res2, res3;
@ -69,16 +70,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
}
static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
static void ITransform_MSA(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
static void FTransform_MSA(const uint8_t* WEBP_RESTRICT src,
const uint8_t* WEBP_RESTRICT ref,
int16_t* WEBP_RESTRICT out) {
uint64_t out0, out1, out2, out3;
uint32_t in0, in1, in2, in3;
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
@ -131,7 +134,8 @@ static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
SD4(out0, out1, out2, out3, out, 8);
}
static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
static void FTransformWHT_MSA(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out) {
v8i16 in0 = { 0 };
v8i16 in1 = { 0 };
v8i16 tmp0, tmp1, tmp2, tmp3;
@ -168,7 +172,8 @@ static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
ST_SH2(out0, out1, out, 8);
}
static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
static int TTransform_MSA(const uint8_t* WEBP_RESTRICT in,
const uint16_t* WEBP_RESTRICT w) {
int sum;
uint32_t in0_m, in1_m, in2_m, in3_m;
v16i8 src0 = { 0 };
@ -200,15 +205,17 @@ static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
return sum;
}
static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_MSA(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
const int sum1 = TTransform_MSA(a, w);
const int sum2 = TTransform_MSA(b, w);
return abs(sum2 - sum1) >> 5;
}
static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_MSA(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -259,7 +266,9 @@ static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
// vertical
static WEBP_INLINE void VE4(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const v16u8 A1 = { 0 };
const uint64_t val_m = LD(top - 1);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
@ -272,7 +281,9 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
SW4(out, out, out, out, dst, BPS);
}
static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
// horizontal
static WEBP_INLINE void HE4(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@ -284,7 +295,8 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}
static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void DC4(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
@ -293,7 +305,8 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
SW4(dc, dc, dc, dc, dst, BPS);
}
static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void RD4(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const v16u8 A2 = { 0 };
const uint64_t val_m = LD(top - 5);
const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
@ -313,7 +326,8 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
SW4(val3, val2, val1, val0, dst, BPS);
}
static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void LD4(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const v16u8 A1 = { 0 };
const uint64_t val_m = LD(top);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
@ -333,7 +347,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
SW4(val0, val1, val2, val3, dst, BPS);
}
static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void VR4(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@ -354,7 +369,8 @@ static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
DST(3, 1) = AVG3(B, C, D);
}
static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void VL4(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const int A = top[0];
const int B = top[1];
const int C = top[2];
@ -375,7 +391,8 @@ static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
DST(3, 3) = AVG3(F, G, H);
}
static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void HU4(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
@ -390,7 +407,8 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void HD4(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@ -411,7 +429,8 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
DST(1, 3) = AVG3(L, K, J);
}
static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void TM4(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const v16i8 zero = { 0 };
const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
@ -431,7 +450,8 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
#undef AVG3
#undef AVG2
static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
static void Intra4Preds_MSA(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
@ -451,7 +471,8 @@ static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); \
} while (0)
static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void VerticalPred16x16(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
if (top != NULL) {
const v16u8 out = LD_UB(top);
STORE16x16(out, dst);
@ -461,8 +482,8 @@ static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
}
}
static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
const uint8_t* left) {
static WEBP_INLINE void HorizontalPred16x16(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left) {
if (left != NULL) {
int j;
for (j = 0; j < 16; j += 4) {
@ -480,8 +501,9 @@ static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
}
}
static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void TrueMotion16x16(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
if (left != NULL) {
if (top != NULL) {
int j;
@ -519,8 +541,9 @@ static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
}
}
static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DCMode16x16(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
int DC;
v16u8 out;
if (top != NULL && left != NULL) {
@ -548,8 +571,9 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
STORE16x16(out, dst);
}
static void Intra16Preds_MSA(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
static void Intra16Preds_MSA(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
DCMode16x16(I16DC16 + dst, left, top);
VerticalPred16x16(I16VE16 + dst, top);
HorizontalPred16x16(I16HE16 + dst, left);
@ -574,7 +598,8 @@ static void Intra16Preds_MSA(uint8_t* dst,
SD4(out, out, out, out, dst + 4 * BPS, BPS); \
} while (0)
static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void VerticalPred8x8(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
if (top != NULL) {
const uint64_t out = LD(top);
STORE8x8(out, dst);
@ -584,7 +609,8 @@ static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
}
}
static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
static WEBP_INLINE void HorizontalPred8x8(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left) {
if (left != NULL) {
int j;
for (j = 0; j < 8; j += 4) {
@ -606,8 +632,9 @@ static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
}
}
static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void TrueMotion8x8(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
if (left != NULL) {
if (top != NULL) {
int j;
@ -646,8 +673,9 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
}
}
static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DCMode8x8(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
uint64_t out;
v16u8 src = { 0 };
if (top != NULL && left != NULL) {
@ -670,8 +698,9 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
STORE8x8(out, dst);
}
static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static void IntraChromaPreds_MSA(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
// U block
DCMode8x8(C8DC8 + dst, left, top);
VerticalPred8x8(C8VE8 + dst, top);
@ -712,7 +741,8 @@ static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
} while (0)
static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_MSA(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -739,7 +769,8 @@ static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
return sum;
}
static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_MSA(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -758,7 +789,8 @@ static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
return sum;
}
static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_MSA(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -778,7 +810,8 @@ static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
return sum;
}
static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_MSA(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
uint32_t sum = 0;
uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
@ -801,7 +834,7 @@ static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
// Quantization
static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
int sum;
v8i16 in0, in1, sh0, sh1, out0, out1;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
@ -854,7 +887,7 @@ static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
}
static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
int nz;
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;

View File

@ -60,8 +60,8 @@ static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
const int16x8_t row23,
const uint8_t* const ref,
uint8_t* const dst) {
const uint8_t* WEBP_RESTRICT const ref,
uint8_t* WEBP_RESTRICT const dst) {
uint32x2_t dst01 = vdup_n_u32(0);
uint32x2_t dst23 = vdup_n_u32(0);
@ -120,8 +120,9 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
Transpose8x2_NEON(E0, E1, rows);
}
static void ITransformOne_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int16x8x2_t rows;
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
TransformPass_NEON(&rows);
@ -131,8 +132,9 @@ static void ITransformOne_NEON(const uint8_t* ref,
#else
static void ITransformOne_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int kBPS = BPS;
const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
@ -247,8 +249,9 @@ static void ITransformOne_NEON(const uint8_t* ref,
#endif // WEBP_USE_INTRINSICS
static void ITransform_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst, int do_two) {
static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
ITransformOne_NEON(ref, in, dst);
if (do_two) {
ITransformOne_NEON(ref + 4, in + 16, dst + 4);
@ -294,8 +297,9 @@ static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
return vreinterpretq_s16_u16(vsubl_u8(a, b));
}
static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
const uint8_t* WEBP_RESTRICT ref,
int16_t* WEBP_RESTRICT out) {
int16x8_t d0d1, d3d2; // working 4x4 int16 variables
{
const uint8x16_t S0 = Load4x4_NEON(src);
@ -364,8 +368,9 @@ static const int32_t kCoeff32[] = {
51000, 51000, 51000, 51000
};
static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
const uint8_t* WEBP_RESTRICT ref,
int16_t* WEBP_RESTRICT out) {
const int kBPS = BPS;
const uint8_t* src_ptr = src;
const uint8_t* ref_ptr = ref;
@ -484,7 +489,8 @@ static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
src += stride; \
} while (0)
static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src,
int16_t* WEBP_RESTRICT out) {
const int stride = 16;
const int16x4_t zero = vdup_n_s16(0);
int32x4x4_t tmp0;
@ -659,8 +665,9 @@ static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
@ -701,8 +708,9 @@ static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
}
#undef LOAD_LANE_32b
static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -715,9 +723,10 @@ static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
//------------------------------------------------------------------------------
static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
const uint8_t* WEBP_RESTRICT pred,
int start_block, int end_block,
VP8Histogram* const histo) {
VP8Histogram* WEBP_RESTRICT const histo) {
const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@ -747,9 +756,9 @@ static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
//------------------------------------------------------------------------------
static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
const uint8_t* const b,
uint32x4_t* const sum) {
static WEBP_INLINE void AccumulateSSE16_NEON(
const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b,
uint32x4_t* const sum) {
const uint8x16_t a0 = vld1q_u8(a);
const uint8x16_t b0 = vld1q_u8(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@ -775,7 +784,8 @@ static int SumToInt_NEON(uint32x4_t sum) {
#endif
}
static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 16; ++y) {
@ -784,7 +794,8 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
return SumToInt_NEON(sum);
}
static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
@ -793,7 +804,8 @@ static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
return SumToInt_NEON(sum);
}
static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
@ -806,7 +818,8 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
return SumToInt_NEON(sum);
}
static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
const uint8x16_t a0 = Load4x4_NEON(a);
const uint8x16_t b0 = Load4x4_NEON(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@ -825,8 +838,9 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)
static int16x8_t Quantize_NEON(int16_t* const in,
const VP8Matrix* const mtx, int offset) {
static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in,
const VP8Matrix* WEBP_RESTRICT const mtx,
int offset) {
const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
@ -860,7 +874,7 @@ static const uint8_t kShuffles[4][8] = {
};
static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
uint8x8x4_t shuffles;
@ -902,7 +916,7 @@ static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
}
static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
int nz;
nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -911,6 +925,271 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
#endif // !WORK_AROUND_GCC
#if WEBP_AARCH64
#if BPS == 32
#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane) \
do { \
uint8x16_t r; \
r = vqtbl2q_u8(qcombined, tbl); \
r = vreinterpretq_u8_u32( \
vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane), \
vreinterpretq_u32_u8(r), 1)); \
vst1q_u8(dst, r); \
} while (0)
#define RD4_VR4_LD4_VL4_NEON(dst, tbl) \
do { \
uint8x16_t r; \
r = vqtbl2q_u8(qcombined, tbl); \
vst1q_u8(dst, r); \
} while (0)
static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13
// L K J I X A B C D E F G H
// -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7
static const uint8_t kLookupTbl1[64] = {
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12,
3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0,
4, 20, 21, 22, 3, 18, 2, 17, 3, 19, 4, 20, 2, 17, 1, 16,
2, 18, 3, 19, 1, 16, 31, 31, 1, 17, 2, 18, 31, 31, 31, 31
};
static const uint8_t kLookupTbl2[64] = {
20, 21, 22, 23, 5, 6, 7, 8, 22, 23, 24, 25, 6, 7, 8, 9,
19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25,
18, 19, 20, 21, 19, 5, 6, 7, 24, 25, 26, 27, 7, 8, 9, 26,
17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27
};
static const uint8_t kLookupTbl3[64] = {
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 19, 19, 19, 19,
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 18, 18, 18, 18,
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 17, 17, 17, 17,
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 16, 16, 16, 16
};
const uint8x16x4_t lookup_avgs1 = vld1q_u8_x4(kLookupTbl1);
const uint8x16x4_t lookup_avgs2 = vld1q_u8_x4(kLookupTbl2);
const uint8x16x4_t lookup_avgs3 = vld1q_u8_x4(kLookupTbl3);
const uint8x16_t preload = vld1q_u8(top - 5);
uint8x16x2_t qcombined;
uint8x16_t result0, result1;
uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]);
uint8x16_t b = preload;
uint8x16_t c = vextq_u8(a, a, 2);
uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b);
uint8x16_t avg2_all = vrhaddq_u8(a, b);
uint8x8_t preload_x8, sub_a, sub_c;
uint8_t result_u8;
uint8x8_t res_lo, res_hi;
uint8x16_t full_b;
uint16x8_t sub, sum_lo, sum_hi;
preload_x8 = vget_low_u8(c);
preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3);
result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3;
avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15);
avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14);
qcombined.val[0] = avg2_all;
qcombined.val[1] = avg3_all;
sub_a = vdup_laneq_u8(preload, 4);
// preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a}
full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]);
// preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d}
sub_c = vreinterpret_u8_u32(vdup_n_u32(
vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0)));
sub = vsubl_u8(sub_c, sub_a);
sum_lo = vaddw_u8(sub, vget_low_u8(full_b));
res_lo = vqmovun_s16(vreinterpretq_s16_u16(sum_lo));
sum_hi = vaddw_u8(sub, vget_high_u8(full_b));
res_hi = vqmovun_s16(vreinterpretq_s16_u16(sum_hi));
// DC4, VE4, HE4, TM4
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0);
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1);
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0);
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1);
// RD4, VR4, LD4, VL4
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]);
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]);
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]);
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]);
// HD4, HU4
result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]);
result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]);
vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0));
vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0));
vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1));
vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1));
}
#endif // BPS == 32
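
One detail of the function above worth unpacking: the pair vhaddq_u8 (truncating halving add) followed by vrhaddq_u8 (rounding halving add) reproduces the scalar AVG3 rounding (a + 2 * b + c + 2) >> 2 exactly, without widening to 16 bits. A small standalone check of that identity, exhaustive over all byte triples:

#include <assert.h>
#include <stdint.h>

// Verify: (((a + c) >> 1) + b + 1) >> 1 == (a + 2 * b + c + 2) >> 2
// for all bytes, i.e. vrhadd(vhadd(a, c), b) computes AVG3(a, b, c).
int main(void) {
  int a, b, c;
  for (a = 0; a < 256; ++a) {
    for (b = 0; b < 256; ++b) {
      for (c = 0; c < 256; ++c) {
        const int avg3 = (a + 2 * b + c + 2) >> 2;
        const int neon = (((a + c) >> 1) + b + 1) >> 1;
        assert(avg3 == neon);
      }
    }
  }
  return 0;
}
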
static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) {
uint8x16_t a = vdupq_n_u8(value);
int i;
for (i = 0; i < 16; i++) {
vst1q_u8(dst + BPS * i, a);
}
}
static WEBP_INLINE void Fill16_NEON(uint8_t* dst, const uint8_t* src) {
uint8x16_t a = vld1q_u8(src);
int i;
for (i = 0; i < 16; i++) {
vst1q_u8(dst + BPS * i, a);
}
}
static WEBP_INLINE void HorizontalPred16_NEON(uint8_t* dst,
const uint8_t* left) {
uint8x16_t a;
if (left == NULL) {
Fill_NEON(dst, 129);
return;
}
a = vld1q_u8(left + 0);
vst1q_u8(dst + BPS * 0, vdupq_laneq_u8(a, 0));
vst1q_u8(dst + BPS * 1, vdupq_laneq_u8(a, 1));
vst1q_u8(dst + BPS * 2, vdupq_laneq_u8(a, 2));
vst1q_u8(dst + BPS * 3, vdupq_laneq_u8(a, 3));
vst1q_u8(dst + BPS * 4, vdupq_laneq_u8(a, 4));
vst1q_u8(dst + BPS * 5, vdupq_laneq_u8(a, 5));
vst1q_u8(dst + BPS * 6, vdupq_laneq_u8(a, 6));
vst1q_u8(dst + BPS * 7, vdupq_laneq_u8(a, 7));
vst1q_u8(dst + BPS * 8, vdupq_laneq_u8(a, 8));
vst1q_u8(dst + BPS * 9, vdupq_laneq_u8(a, 9));
vst1q_u8(dst + BPS * 10, vdupq_laneq_u8(a, 10));
vst1q_u8(dst + BPS * 11, vdupq_laneq_u8(a, 11));
vst1q_u8(dst + BPS * 12, vdupq_laneq_u8(a, 12));
vst1q_u8(dst + BPS * 13, vdupq_laneq_u8(a, 13));
vst1q_u8(dst + BPS * 14, vdupq_laneq_u8(a, 14));
vst1q_u8(dst + BPS * 15, vdupq_laneq_u8(a, 15));
}
static WEBP_INLINE void VerticalPred16_NEON(uint8_t* dst, const uint8_t* top) {
if (top != NULL) {
Fill16_NEON(dst, top);
} else {
Fill_NEON(dst, 127);
}
}
static WEBP_INLINE void DCMode_NEON(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
uint8_t s;
if (top != NULL) {
uint16_t dc;
dc = vaddlvq_u8(vld1q_u8(top));
if (left != NULL) {
// top and left present.
dc += vaddlvq_u8(vld1q_u8(left));
s = vqrshrnh_n_u16(dc, 5);
} else {
// top but no left.
s = vqrshrnh_n_u16(dc, 4);
}
} else {
if (left != NULL) {
uint16_t dc;
// left but no top.
dc = vaddlvq_u8(vld1q_u8(left));
s = vqrshrnh_n_u16(dc, 4);
} else {
// No top, no left, nothing.
s = 0x80;
}
}
Fill_NEON(dst, s);
}
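
The saturating rounding narrows above are fixed shifts in disguise; a scalar sketch of the same 16x16 DC computation, assuming plain unsigned sums of the 16 edge pixels per side:

#include <stddef.h>
#include <stdint.h>

// Scalar equivalent of DCMode_NEON's vqrshrnh_n_u16 narrows: average the
// available edge pixels with round-to-nearest, or fall back to 0x80.
static uint8_t Dc16Sketch(const uint8_t* left, const uint8_t* top) {
  uint32_t sum = 0;
  int i;
  if (top != NULL && left != NULL) {
    for (i = 0; i < 16; ++i) sum += top[i] + left[i];
    return (uint8_t)((sum + 16) >> 5);        // 32 samples
  } else if (top != NULL || left != NULL) {
    const uint8_t* const edge = (top != NULL) ? top : left;
    for (i = 0; i < 16; ++i) sum += edge[i];
    return (uint8_t)((sum + 8) >> 4);         // 16 samples
  }
  return 0x80;                                // no context at all
}
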
static WEBP_INLINE void TrueMotionHelper_NEON(uint8_t* dst,
const uint8x8_t outer,
const uint8x8x2_t inner,
const uint16x8_t a, int i,
const int n) {
uint8x8_t d1, d2;
uint16x8_t r1, r2;
r1 = vaddl_u8(outer, inner.val[0]);
r1 = vqsubq_u16(r1, a);
d1 = vqmovun_s16(vreinterpretq_s16_u16(r1));
r2 = vaddl_u8(outer, inner.val[1]);
r2 = vqsubq_u16(r2, a);
d2 = vqmovun_s16(vreinterpretq_s16_u16(r2));
vst1_u8(dst + BPS * (i * 4 + n), d1);
vst1_u8(dst + BPS * (i * 4 + n) + 8, d2);
}
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
int i;
uint16x8_t a;
uint8x8x2_t inner;
if (left == NULL) {
// True motion without left samples (hence: with default 129 value) is
// equivalent to VE prediction where you just copy the top samples.
// Note that if top samples are not available, the default value is then
// 129, and not 127 as in the VerticalPred case.
if (top != NULL) {
VerticalPred16_NEON(dst, top);
} else {
Fill_NEON(dst, 129);
}
return;
}
// left is not NULL.
if (top == NULL) {
HorizontalPred16_NEON(dst, left);
return;
}
// Neither left nor top is NULL.
a = vdupq_n_u16(left[-1]);
inner = vld1_u8_x2(top);
for (i = 0; i < 4; i++) {
const uint8x8x4_t outer = vld4_dup_u8(&left[i * 4]);
TrueMotionHelper_NEON(dst, outer.val[0], inner, a, i, 0);
TrueMotionHelper_NEON(dst, outer.val[1], inner, a, i, 1);
TrueMotionHelper_NEON(dst, outer.val[2], inner, a, i, 2);
TrueMotionHelper_NEON(dst, outer.val[3], inner, a, i, 3);
}
}
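
For reference against the vector code above, the TM ("TrueMotion") predictor itself is simple in scalar form: each output pixel is left[y] + top[x] minus the top-left neighbour (read from left[-1] in the code above), clamped to [0, 255]. A minimal sketch, with stride, size and the top-left value as assumed parameters:

#include <stdint.h>

// Scalar sketch of the TrueMotion predictor the NEON helpers implement:
// dst[x, y] = clip(left[y] + top[x] - top_left, 0, 255).
static void TrueMotionSketch(uint8_t* dst, int stride, const uint8_t* left,
                             const uint8_t* top, int top_left, int size) {
  int x, y;
  for (y = 0; y < size; ++y) {
    for (x = 0; x < size; ++x) {
      const int v = left[y] + top[x] - top_left;
      dst[x] = (uint8_t)((v < 0) ? 0 : (v > 255) ? 255 : v);
    }
    dst += stride;
  }
}
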
static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
DCMode_NEON(I16DC16 + dst, left, top);
VerticalPred16_NEON(I16VE16 + dst, top);
HorizontalPred16_NEON(I16HE16 + dst, left);
TrueMotion_NEON(I16TM16 + dst, left, top);
}
#endif // WEBP_AARCH64
//------------------------------------------------------------------------------
// Entry point
@ -931,9 +1210,17 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
VP8SSE8x8 = SSE8x8_NEON;
VP8SSE4x4 = SSE4x4_NEON;
#if WEBP_AARCH64
#if BPS == 32
VP8EncPredLuma4 = Intra4Preds_NEON;
#endif
VP8EncPredLuma16 = Intra16Preds_NEON;
#endif
#if !defined(WORK_AROUND_GCC)
VP8EncQuantizeBlock = QuantizeBlock_NEON;
VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
VP8EncQuantizeBlockWHT = QuantizeBlock_NEON;
#endif
}

View File

@ -26,8 +26,9 @@
// Transforms (Paragraph 14.4)
// Does one inverse transform.
static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
static void ITransform_One_SSE2(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -177,8 +178,9 @@ static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in,
}
// Does two inverse transforms.
static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
static void ITransform_Two_SSE2(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -316,7 +318,9 @@ static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in,
}
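
Both transforms above quote K1 = sqrt(2) * cos(pi/8) ~= 85627 / 2^16; its companion constant sqrt(2) * sin(pi/8) works out to about 35468 / 2^16. A quick standalone check of those fixed-point values:

#include <math.h>
#include <stdio.h>

// Recompute the 16-bit fixed-point transform constants referenced above.
// Expected output: 85627 35468.
int main(void) {
  const double pi = acos(-1.0);
  const double k1 = sqrt(2.0) * cos(pi / 8.0);  // ~1.30656
  const double k2 = sqrt(2.0) * sin(pi / 8.0);  // ~0.54120
  printf("%.0f %.0f\n", floor(k1 * 65536.0 + 0.5), floor(k2 * 65536.0 + 0.5));
  return 0;
}
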
// Does one or two inverse transforms.
static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
static void ITransform_SSE2(const uint8_t* WEBP_RESTRICT ref,
const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst,
int do_two) {
if (do_two) {
ITransform_Two_SSE2(ref, in, dst);
@ -373,7 +377,7 @@ static void FTransformPass1_SSE2(const __m128i* const in01,
static void FTransformPass2_SSE2(const __m128i* const v01,
const __m128i* const v32,
int16_t* out) {
int16_t* WEBP_RESTRICT out) {
const __m128i zero = _mm_setzero_si128();
const __m128i seven = _mm_set1_epi16(7);
const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
@ -424,8 +428,9 @@ static void FTransformPass2_SSE2(const __m128i* const v01,
_mm_storeu_si128((__m128i*)&out[8], d2_f3);
}
static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
static void FTransform_SSE2(const uint8_t* WEBP_RESTRICT src,
const uint8_t* WEBP_RESTRICT ref,
int16_t* WEBP_RESTRICT out) {
const __m128i zero = _mm_setzero_si128();
// Load src.
const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
@ -468,8 +473,9 @@ static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
FTransformPass2_SSE2(&v01, &v32, out);
}
static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
static void FTransform2_SSE2(const uint8_t* WEBP_RESTRICT src,
const uint8_t* WEBP_RESTRICT ref,
int16_t* WEBP_RESTRICT out) {
const __m128i zero = _mm_setzero_si128();
// Load src and convert to 16b.
@ -517,7 +523,8 @@ static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
FTransformPass2_SSE2(&v01h, &v32h, out + 16);
}
static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
static void FTransformWHTRow_SSE2(const int16_t* WEBP_RESTRICT const in,
__m128i* const out) {
const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
@ -533,7 +540,8 @@ static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
*out = _mm_madd_epi16(D, kMult);
}
static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
static void FTransformWHT_SSE2(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out) {
// Input is 12b signed.
__m128i row0, row1, row2, row3;
// Rows are 14b signed.
@ -566,9 +574,10 @@ static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
static void CollectHistogram_SSE2(const uint8_t* WEBP_RESTRICT ref,
const uint8_t* WEBP_RESTRICT pred,
int start_block, int end_block,
VP8Histogram* const histo) {
VP8Histogram* WEBP_RESTRICT const histo) {
const __m128i zero = _mm_setzero_si128();
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
int j;
@ -640,7 +649,8 @@ static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
}
}
static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void VE8uv_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
int j;
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
for (j = 0; j < 8; ++j) {
@ -648,7 +658,8 @@ static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
}
}
static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void VE16_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const __m128i top_values = _mm_load_si128((const __m128i*)top);
int j;
for (j = 0; j < 16; ++j) {
@ -656,8 +667,9 @@ static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
}
}
static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
const uint8_t* top, int size) {
static WEBP_INLINE void VerticalPred_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top,
int size) {
if (top != NULL) {
if (size == 8) {
VE8uv_SSE2(dst, top);
@ -669,7 +681,8 @@ static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
}
}
static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
static WEBP_INLINE void HE8uv_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left) {
int j;
for (j = 0; j < 8; ++j) {
const __m128i values = _mm_set1_epi8((char)left[j]);
@ -678,7 +691,8 @@ static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
}
}
static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
static WEBP_INLINE void HE16_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left) {
int j;
for (j = 0; j < 16; ++j) {
const __m128i values = _mm_set1_epi8((char)left[j]);
@ -687,8 +701,9 @@ static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
}
}
static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
const uint8_t* left, int size) {
static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
int size) {
if (left != NULL) {
if (size == 8) {
HE8uv_SSE2(dst, left);
@ -700,8 +715,9 @@ static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
}
}
static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
static WEBP_INLINE void TM_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top, int size) {
const __m128i zero = _mm_setzero_si128();
int y;
if (size == 8) {
@ -728,8 +744,10 @@ static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
}
}
static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
static WEBP_INLINE void TrueMotion_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top,
int size) {
if (left != NULL) {
if (top != NULL) {
TM_SSE2(dst, left, top, size);
@ -749,8 +767,9 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
}
}
static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DC8uv_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
@ -758,7 +777,8 @@ static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
Put8x8uv_SSE2(DC >> 4, dst);
}
static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const __m128i zero = _mm_setzero_si128();
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
const __m128i sum = _mm_sad_epu8(top_values, zero);
@ -766,7 +786,8 @@ static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
Put8x8uv_SSE2(DC >> 3, dst);
}
static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left) {
// 'left' is contiguous so we can reuse the top summation.
DC8uvNoLeft_SSE2(dst, left);
}
@ -775,8 +796,9 @@ static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
Put8x8uv_SSE2(0x80, dst);
}
static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
if (top != NULL) {
if (left != NULL) { // top and left present
DC8uv_SSE2(dst, left, top);
@ -790,8 +812,9 @@ static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
}
}
static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DC16_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
const __m128i top_row = _mm_load_si128((const __m128i*)top);
const __m128i left_row = _mm_load_si128((const __m128i*)left);
const int DC =
@ -799,13 +822,15 @@ static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
Put16_SSE2(DC >> 5, dst);
}
static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const __m128i top_row = _mm_load_si128((const __m128i*)top);
const int DC = VP8HorizontalAdd8b(&top_row) + 8;
Put16_SSE2(DC >> 4, dst);
}
static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left) {
// 'left' is contiguous so we can reuse the top summation.
DC16NoLeft_SSE2(dst, left);
}
@ -814,8 +839,9 @@ static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
Put16_SSE2(0x80, dst);
}
static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DC16Mode_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
if (top != NULL) {
if (left != NULL) { // top and left present
DC16_SSE2(dst, left, top);
@ -844,8 +870,9 @@ static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
// where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
// and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
const uint8_t* top) { // vertical
// vertical
static WEBP_INLINE void VE4_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
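The 8b trick quoted above can be checked exhaustively in scalar form. A minimal sketch, assuming the variant these kernels implement (avg2(a, c) corrected by the xor lsb, then averaged with b, where avg2(x, y) = (x + y + 1) >> 1 is what _mm_avg_epu8 computes per lane):

static int CheckAvg3Identity(void) {
  int a, b, c;
  for (a = 0; a < 256; ++a) {
    for (b = 0; b < 256; ++b) {
      for (c = 0; c < 256; ++c) {
        const int ac = ((a + c + 1) >> 1) - ((a ^ c) & 1);  // == (a + c) >> 1
        const int avg = (ac + b + 1) >> 1;                  // final _mm_avg_epu8
        if (avg != ((a + 2 * b + c + 2) >> 2)) return 0;    // AVG3 reference
      }
    }
  }
  return 1;  // the identity holds for all 8-bit inputs
}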
@ -861,8 +888,9 @@ static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
}
}
static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
const uint8_t* top) { // horizontal
// horizontal
static WEBP_INLINE void HE4_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@ -874,15 +902,17 @@ static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}
static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void DC4_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
Fill_SSE2(dst, dc >> 3, 4);
}
static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
const uint8_t* top) { // Down-Left
// Down-Left
static WEBP_INLINE void LD4_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -898,8 +928,9 @@ static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}
static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
const uint8_t* top) { // Vertical-Right
// Vertical-Right
static WEBP_INLINE void VR4_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const __m128i one = _mm_set1_epi8(1);
const int I = top[-2];
const int J = top[-3];
@ -924,8 +955,9 @@ static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
DST(0, 3) = AVG3(K, J, I);
}
static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
const uint8_t* top) { // Vertical-Left
// Vertical-Left
static WEBP_INLINE void VL4_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@ -951,8 +983,9 @@ static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
DST(3, 3) = (extra_out >> 8) & 0xff;
}
static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
const uint8_t* top) { // Down-right
// Down-right
static WEBP_INLINE void RD4_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const __m128i one = _mm_set1_epi8(1);
const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
@ -968,7 +1001,8 @@ static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}
static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void HU4_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
@ -983,7 +1017,8 @@ static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void HD4_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@ -1006,7 +1041,8 @@ static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
DST(1, 3) = AVG3(L, K, J);
}
static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void TM4_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
const __m128i zero = _mm_setzero_si128();
const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top));
const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
@ -1028,7 +1064,8 @@ static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
// Left samples are at top[-5 .. -2], top_left is at top[-1], the top samples
// are located at top[0..3], and the top-right samples at top[4..7]

static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
static void Intra4Preds_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
DC4_SSE2(I4DC4 + dst, top);
TM4_SSE2(I4TM4 + dst, top);
VE4_SSE2(I4VE4 + dst, top);
@ -1044,8 +1081,9 @@ static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static void IntraChromaPreds_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
// U block
DC8uvMode_SSE2(C8DC8 + dst, left, top);
VerticalPred_SSE2(C8VE8 + dst, top, 8);
@ -1064,8 +1102,9 @@ static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
static void Intra16Preds_SSE2(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
static void Intra16Preds_SSE2(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
DC16Mode_SSE2(I16DC16 + dst, left, top);
VerticalPred_SSE2(I16VE16 + dst, top, 16);
HorizontalPred_SSE2(I16HE16 + dst, left, 16);
@ -1092,7 +1131,8 @@ static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
*sum = _mm_add_epi32(sum1, sum2);
}
static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b,
int num_pairs) {
__m128i sum = _mm_setzero_si128();
int32_t tmp[4];
@ -1114,18 +1154,21 @@ static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_SSE2(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
return SSE_16xN_SSE2(a, b, 8);
}
static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_SSE2(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
return SSE_16xN_SSE2(a, b, 4);
}
#define LOAD_8x16b(ptr) \
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_SSE2(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
const __m128i zero = _mm_setzero_si128();
int num_pairs = 4;
__m128i sum = zero;
@ -1152,7 +1195,8 @@ static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
}
#undef LOAD_8x16b
static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_SSE2(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT b) {
const __m128i zero = _mm_setzero_si128();
// Load values. Note that we read 8 pixels instead of 4,
@ -1189,7 +1233,7 @@ static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
//------------------------------------------------------------------------------
static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
static void Mean16x4_SSE2(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) {
const __m128i mask = _mm_set1_epi16(0x00ff);
const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
@ -1227,8 +1271,9 @@ static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
static int TTransform_SSE2(const uint8_t* WEBP_RESTRICT inA,
const uint8_t* WEBP_RESTRICT inB,
const uint16_t* WEBP_RESTRICT const w) {
int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
const __m128i zero = _mm_setzero_si128();
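For orientation, a scalar sketch of the measure TTransform_SSE2 vectorizes, shaped after the C fallback (BPS is the usual prediction-buffer stride; abs() needs <stdlib.h>): an un-normalized 4x4 Walsh-Hadamard transform followed by a weighted sum of absolute coefficients. Disto4x4 then returns the absolute difference of this measure between the two blocks, scaled by >> 5.

static int THadamardWeightedSum_Sketch(const uint8_t* in, const uint16_t* w) {
  int tmp[16], i, sum = 0;
  for (i = 0; i < 4; ++i, in += BPS) {   // horizontal butterflies
    const int a0 = in[0] + in[2], a1 = in[1] + in[3];
    const int a2 = in[1] - in[3], a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i, ++w) {         // vertical butterflies + weighting
    const int a0 = tmp[0 + i] + tmp[8 + i], a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i], a3 = tmp[0 + i] - tmp[8 + i];
    sum += w[0] * abs(a0 + a1) + w[4] * abs(a3 + a2)
         + w[8] * abs(a3 - a2) + w[12] * abs(a0 - a1);
  }
  return sum;
}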
@ -1328,14 +1373,16 @@ static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
return sum[0] + sum[1] + sum[2] + sum[3];
}
static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_SSE2(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
const int diff_sum = TTransform_SSE2(a, b, w);
return abs(diff_sum) >> 5;
}
static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_SSE2(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -1350,9 +1397,10 @@ static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
// Quantization
//
static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
const uint16_t* const sharpen,
const VP8Matrix* const mtx) {
static WEBP_INLINE int DoQuantizeBlock_SSE2(
int16_t in[16], int16_t out[16],
const uint16_t* WEBP_RESTRICT const sharpen,
const VP8Matrix* WEBP_RESTRICT const mtx) {
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
const __m128i zero = _mm_setzero_si128();
__m128i coeff0, coeff8;
@ -1463,17 +1511,17 @@ static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
}
static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
}
static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
}
static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
int nz;
const uint16_t* const sharpen = &mtx->sharpen_[0];
nz = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;

View File

@ -23,9 +23,10 @@
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms.
static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
static void CollectHistogram_SSE41(const uint8_t* WEBP_RESTRICT ref,
const uint8_t* WEBP_RESTRICT pred,
int start_block, int end_block,
VP8Histogram* const histo) {
VP8Histogram* WEBP_RESTRICT const histo) {
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@ -168,14 +169,16 @@ static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
return sum[0] + sum[1] + sum[2] + sum[3];
}
static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_SSE41(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
const int diff_sum = TTransform_SSE41(a, b, w);
return abs(diff_sum) >> 5;
}
static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_SSE41(const uint8_t* WEBP_RESTRICT const a,
const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -301,17 +304,17 @@ static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
#undef PSHUFB_CST
static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
}
static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
}
static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
const VP8Matrix* WEBP_RESTRICT const mtx) {
int nz;
const uint16_t* const sharpen = &mtx->sharpen_[0];
nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;

View File

@ -23,55 +23,42 @@
do { \
assert((in) != NULL); \
assert((out) != NULL); \
assert((in) != (out)); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; /* Silence unused warning. */ \
} while (0)
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length, int inverse) {
static WEBP_INLINE void PredictLine_C(const uint8_t* WEBP_RESTRICT src,
const uint8_t* WEBP_RESTRICT pred,
uint8_t* WEBP_RESTRICT dst, int length) {
int i;
if (inverse) {
for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] + pred[i]);
} else {
for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]);
}
for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]);
}
//------------------------------------------------------------------------------
// Horizontal filter.
static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* WEBP_RESTRICT in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
uint8_t* WEBP_RESTRICT out) {
const uint8_t* preds = in;
int row;
DCHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
out += stride;
}
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLine_C(in + 1, preds, out + 1, width - 1);
preds += stride;
in += stride;
out += stride;
// Filter line-by-line.
while (row < last_row) {
for (row = 1; row < height; ++row) {
// Leftmost pixel is predicted from above.
PredictLine_C(in, preds - stride, out, 1, inverse);
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
++row;
PredictLine_C(in, preds - stride, out, 1);
PredictLine_C(in + 1, preds, out + 1, width - 1);
preds += stride;
in += stride;
out += stride;
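Worked example of the forward horizontal filter: a top row in = {10, 12, 15, 15} becomes out = {10, 2, 3, 0}, the leftmost pixel copied and the rest stored as left-deltas; on every later row the leftmost pixel is predicted from the pixel directly above instead.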
@ -81,35 +68,23 @@ static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
//------------------------------------------------------------------------------
// Vertical filter.
static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* WEBP_RESTRICT in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
uint8_t* WEBP_RESTRICT out) {
const uint8_t* preds = in;
int row;
DCHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) {
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
in += stride;
out += stride;
} else {
// We are starting from in-between. Make sure 'preds' points to prev row.
preds -= stride;
}
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLine_C(in + 1, preds, out + 1, width - 1);
in += stride;
out += stride;
// Filter line-by-line.
while (row < last_row) {
PredictLine_C(in, preds, out, width, inverse);
++row;
for (row = 1; row < height; ++row) {
PredictLine_C(in, preds, out, width);
preds += stride;
in += stride;
out += stride;
@ -126,40 +101,31 @@ static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
}
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
static WEBP_INLINE void DoGradientFilter_C(const uint8_t* WEBP_RESTRICT in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
uint8_t* WEBP_RESTRICT out) {
const uint8_t* preds = in;
int row;
DCHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
out += stride;
}
out[0] = in[0];
PredictLine_C(in + 1, preds, out + 1, width - 1);
preds += stride;
in += stride;
out += stride;
// Filter line-by-line.
while (row < last_row) {
for (row = 1; row < height; ++row) {
int w;
// leftmost pixel: predict from above.
PredictLine_C(in, preds - stride, out, 1, inverse);
PredictLine_C(in, preds - stride, out, 1);
for (w = 1; w < width; ++w) {
const int pred = GradientPredictor_C(preds[w - 1],
preds[w - stride],
preds[w - stride - 1]);
out[w] = (uint8_t)(in[w] + (inverse ? pred : -pred));
out[w] = (uint8_t)(in[w] - pred);
}
++row;
preds += stride;
in += stride;
out += stride;
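The gradient predictor referenced in the hunk headers is, in scalar form, left + above - above_left clamped to [0, 255]; e.g. left = 200, above = 100, above_left = 20 gives 280, which clamps to 255. A sketch matching that definition:

static WEBP_INLINE int GradientPredictor_Sketch(uint8_t a, uint8_t b,
                                                uint8_t c) {
  const int g = a + b - c;
  return ((g & ~0xff) == 0) ? g              // already within [0, 255]
                            : (g < 0) ? 0 : 255;
}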
@ -172,20 +138,22 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
//------------------------------------------------------------------------------
#if !WEBP_NEON_OMIT_C_CODE
static void HorizontalFilter_C(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_C(data, width, height, stride, 0, height, 0,
filtered_data);
static void HorizontalFilter_C(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
DoHorizontalFilter_C(data, width, height, stride, filtered_data);
}
static void VerticalFilter_C(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
static void VerticalFilter_C(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
DoVerticalFilter_C(data, width, height, stride, filtered_data);
}
static void GradientFilter_C(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
static void GradientFilter_C(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
DoGradientFilter_C(data, width, height, stride, filtered_data);
}
#endif // !WEBP_NEON_OMIT_C_CODE

View File

@ -26,13 +26,12 @@
#define DCHECK(in, out) \
do { \
assert(in != NULL); \
assert(out != NULL); \
assert((in) != NULL); \
assert((out) != NULL); \
assert((in) != (out)); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; /* Silence unused warning. */ \
} while (0)
#define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do { \
@ -103,7 +102,8 @@
); \
} while (0)
static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* WEBP_RESTRICT src,
uint8_t* WEBP_RESTRICT dst,
int length) {
DO_PREDICT_LINE(src, dst, length, 0);
}
@ -184,99 +184,75 @@ static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
// Horizontal filter.
#define FILTER_LINE_BY_LINE do { \
while (row < last_row) { \
for (row = 1; row < height; ++row) { \
PREDICT_LINE_ONE_PASS(in, preds - stride, out); \
DO_PREDICT_LINE(in + 1, out + 1, width - 1, 0); \
++row; \
preds += stride; \
in += stride; \
out += stride; \
} \
} while (0)
static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(
const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
uint8_t* WEBP_RESTRICT out) {
const uint8_t* preds = in;
int row;
DCHECK(in, out);
in += start_offset;
out += start_offset;
preds = in;
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
preds += stride;
in += stride;
out += stride;
}
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
preds += stride;
in += stride;
out += stride;
// Filter line-by-line.
FILTER_LINE_BY_LINE;
}
#undef FILTER_LINE_BY_LINE
static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
filtered_data);
static void HorizontalFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
DoHorizontalFilter_MIPSdspR2(data, width, height, stride, filtered_data);
}
//------------------------------------------------------------------------------
// Vertical filter.
#define FILTER_LINE_BY_LINE do { \
while (row < last_row) { \
for (row = 1; row < height; ++row) { \
DO_PREDICT_LINE_VERTICAL(in, preds, out, width, 0); \
++row; \
preds += stride; \
in += stride; \
out += stride; \
} \
} while (0)
static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(
const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
uint8_t* WEBP_RESTRICT out) {
const uint8_t* preds = in;
int row;
DCHECK(in, out);
in += start_offset;
out += start_offset;
preds = in;
if (row == 0) {
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
} else {
// We are starting from in-between. Make sure 'preds' points to prev row.
preds -= stride;
}
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
in += stride;
out += stride;
// Filter line-by-line.
FILTER_LINE_BY_LINE;
}
#undef FILTER_LINE_BY_LINE
static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
filtered_data);
static void VerticalFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
DoVerticalFilter_MIPSdspR2(data, width, height, stride, filtered_data);
}
//------------------------------------------------------------------------------
@ -297,7 +273,7 @@ static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
}
#define FILTER_LINE_BY_LINE(PREDS, OPERATION) do { \
while (row < last_row) { \
for (row = 1; row < height; ++row) { \
int w; \
PREDICT_LINE_ONE_PASS(in, PREDS - stride, out); \
for (w = 1; w < width; ++w) { \
@ -306,42 +282,34 @@ static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
PREDS[w - stride - 1]); \
out[w] = in[w] OPERATION pred; \
} \
++row; \
in += stride; \
out += stride; \
} \
} while (0)
static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
static void DoGradientFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT in,
int width, int height, int stride,
int row, int num_rows, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
uint8_t* WEBP_RESTRICT out) {
const uint8_t* preds = in;
int row;
DCHECK(in, out);
in += start_offset;
out += start_offset;
preds = in;
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
preds += stride;
in += stride;
out += stride;
}
out[0] = in[0];
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
preds += stride;
in += stride;
out += stride;
// Filter line-by-line.
FILTER_LINE_BY_LINE(in, -);
}
#undef FILTER_LINE_BY_LINE
static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
filtered_data);
static void GradientFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
DoGradientFilter_MIPSdspR2(data, width, height, stride, filtered_data);
}
//------------------------------------------------------------------------------

View File

@ -21,7 +21,8 @@
static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
const uint8_t* pred,
uint8_t* dst, int length) {
uint8_t* WEBP_RESTRICT dst,
int length) {
v16u8 src0, pred0, dst0;
assert(length >= 0);
while (length >= 32) {
@ -58,8 +59,9 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
#define DCHECK(in, out) \
do { \
assert(in != NULL); \
assert(out != NULL); \
assert((in) != NULL); \
assert((out) != NULL); \
assert((in) != (out)); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
@ -68,8 +70,9 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
//------------------------------------------------------------------------------
// Horizontal filter
static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
static void HorizontalFilter_MSA(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
const uint8_t* preds = data;
const uint8_t* in = data;
uint8_t* out = filtered_data;
@ -99,8 +102,8 @@ static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
const uint8_t* ppred,
uint8_t* poutput, int stride,
int size) {
uint8_t* WEBP_RESTRICT poutput,
int stride, int size) {
int w;
const v16i8 zero = { 0 };
while (size >= 16) {
@ -131,8 +134,9 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
}
static void GradientFilter_MSA(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
static void GradientFilter_MSA(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
const uint8_t* in = data;
const uint8_t* preds = data;
uint8_t* out = filtered_data;
@ -159,8 +163,9 @@ static void GradientFilter_MSA(const uint8_t* data, int width, int height,
//------------------------------------------------------------------------------
// Vertical filter
static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
static void VerticalFilter_MSA(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
const uint8_t* in = data;
const uint8_t* preds = data;
uint8_t* out = filtered_data;

View File

@ -23,13 +23,12 @@
#define DCHECK(in, out) \
do { \
assert(in != NULL); \
assert(out != NULL); \
assert((in) != NULL); \
assert((out) != NULL); \
assert((in) != (out)); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; /* Silence unused warning. */ \
} while (0)
// load eight u8 and widen to s16
@ -46,7 +45,7 @@
#define ROTATE_RIGHT_N(A, N) vext_u8((A), (A), (8 - (N)) % 8)
static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length) {
uint8_t* WEBP_RESTRICT dst, int length) {
int i;
assert(length >= 0);
for (i = 0; i + 16 <= length; i += 16) {
@ -59,86 +58,70 @@ static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
}
// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
static void PredictLineLeft_NEON(const uint8_t* src, uint8_t* dst, int length) {
static void PredictLineLeft_NEON(const uint8_t* WEBP_RESTRICT src,
uint8_t* WEBP_RESTRICT dst, int length) {
PredictLine_NEON(src, src - 1, dst, length);
}
//------------------------------------------------------------------------------
// Horizontal filter.
static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
static WEBP_INLINE void DoHorizontalFilter_NEON(
const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
uint8_t* WEBP_RESTRICT out) {
int row;
DCHECK(in, out);
in += start_offset;
out += start_offset;
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLineLeft_NEON(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
}
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLineLeft_NEON(in + 1, out + 1, width - 1);
in += stride;
out += stride;
// Filter line-by-line.
while (row < last_row) {
for (row = 1; row < height; ++row) {
// Leftmost pixel is predicted from above.
out[0] = in[0] - in[-stride];
PredictLineLeft_NEON(in + 1, out + 1, width - 1);
++row;
in += stride;
out += stride;
}
}
static void HorizontalFilter_NEON(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_NEON(data, width, height, stride, 0, height,
filtered_data);
static void HorizontalFilter_NEON(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
DoHorizontalFilter_NEON(data, width, height, stride, filtered_data);
}
//------------------------------------------------------------------------------
// Vertical filter.
static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* WEBP_RESTRICT in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
uint8_t* WEBP_RESTRICT out) {
int row;
DCHECK(in, out);
in += start_offset;
out += start_offset;
if (row == 0) {
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLineLeft_NEON(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
}
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLineLeft_NEON(in + 1, out + 1, width - 1);
in += stride;
out += stride;
// Filter line-by-line.
while (row < last_row) {
for (row = 1; row < height; ++row) {
PredictLine_NEON(in, in - stride, out, width);
++row;
in += stride;
out += stride;
}
}
static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter_NEON(data, width, height, stride, 0, height,
filtered_data);
static void VerticalFilter_NEON(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
DoVerticalFilter_NEON(data, width, height, stride, filtered_data);
}
//------------------------------------------------------------------------------
@ -151,7 +134,8 @@ static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
static void GradientPredictDirect_NEON(const uint8_t* const row,
const uint8_t* const top,
uint8_t* const out, int length) {
uint8_t* WEBP_RESTRICT const out,
int length) {
int i;
for (i = 0; i + 8 <= length; i += 8) {
const uint8x8_t A = vld1_u8(&row[i - 1]);
@ -167,40 +151,31 @@ static void GradientPredictDirect_NEON(const uint8_t* const row,
}
}
static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* WEBP_RESTRICT in,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT out) {
int row;
DCHECK(in, out);
in += start_offset;
out += start_offset;
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLineLeft_NEON(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
}
out[0] = in[0];
PredictLineLeft_NEON(in + 1, out + 1, width - 1);
in += stride;
out += stride;
// Filter line-by-line.
while (row < last_row) {
for (row = 1; row < height; ++row) {
out[0] = in[0] - in[-stride];
GradientPredictDirect_NEON(in + 1, in + 1 - stride, out + 1, width - 1);
++row;
in += stride;
out += stride;
}
}
static void GradientFilter_NEON(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_NEON(data, width, height, stride, 0, height,
filtered_data);
static void GradientFilter_NEON(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
DoGradientFilter_NEON(data, width, height, stride, filtered_data);
}
#undef DCHECK

View File

@ -27,15 +27,15 @@
do { \
assert((in) != NULL); \
assert((out) != NULL); \
assert((in) != (out)); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; /* Silence unused warning. */ \
} while (0)
static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length) {
static void PredictLineTop_SSE2(const uint8_t* WEBP_RESTRICT src,
const uint8_t* WEBP_RESTRICT pred,
uint8_t* WEBP_RESTRICT dst, int length) {
int i;
const int max_pos = length & ~31;
assert(length >= 0);
@ -53,7 +53,8 @@ static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
}
// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
static void PredictLineLeft_SSE2(const uint8_t* WEBP_RESTRICT src,
uint8_t* WEBP_RESTRICT dst, int length) {
int i;
const int max_pos = length & ~31;
assert(length >= 0);
@ -73,32 +74,23 @@ static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
//------------------------------------------------------------------------------
// Horizontal filter.
static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
static WEBP_INLINE void DoHorizontalFilter_SSE2(
const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
uint8_t* WEBP_RESTRICT out) {
int row;
DCHECK(in, out);
in += start_offset;
out += start_offset;
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
}
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
in += stride;
out += stride;
// Filter line-by-line.
while (row < last_row) {
for (row = 1; row < height; ++row) {
// Leftmost pixel is predicted from above.
out[0] = in[0] - in[-stride];
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
++row;
in += stride;
out += stride;
}
@ -107,30 +99,22 @@ static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
//------------------------------------------------------------------------------
// Vertical filter.
static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* WEBP_RESTRICT in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
uint8_t* WEBP_RESTRICT out) {
int row;
DCHECK(in, out);
in += start_offset;
out += start_offset;
if (row == 0) {
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
}
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
in += stride;
out += stride;
// Filter line-by-line.
while (row < last_row) {
for (row = 1; row < height; ++row) {
PredictLineTop_SSE2(in, in - stride, out, width);
++row;
in += stride;
out += stride;
}
@ -146,7 +130,8 @@ static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) {
static void GradientPredictDirect_SSE2(const uint8_t* const row,
const uint8_t* const top,
uint8_t* const out, int length) {
uint8_t* WEBP_RESTRICT const out,
int length) {
const int max_pos = length & ~7;
int i;
const __m128i zero = _mm_setzero_si128();
@ -170,30 +155,22 @@ static void GradientPredictDirect_SSE2(const uint8_t* const row,
}
}
static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* WEBP_RESTRICT in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
uint8_t* WEBP_RESTRICT out) {
int row;
DCHECK(in, out);
in += start_offset;
out += start_offset;
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
}
out[0] = in[0];
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
in += stride;
out += stride;
// Filter line-by-line.
while (row < last_row) {
for (row = 1; row < height; ++row) {
out[0] = (uint8_t)(in[0] - in[-stride]);
GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
++row;
in += stride;
out += stride;
}
@ -203,20 +180,22 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
//------------------------------------------------------------------------------
static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_SSE2(data, width, height, stride, 0, height,
filtered_data);
static void HorizontalFilter_SSE2(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
DoHorizontalFilter_SSE2(data, width, height, stride, filtered_data);
}
static void VerticalFilter_SSE2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
static void VerticalFilter_SSE2(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
DoVerticalFilter_SSE2(data, width, height, stride, filtered_data);
}
static void GradientFilter_SSE2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
static void GradientFilter_SSE2(const uint8_t* WEBP_RESTRICT data,
int width, int height, int stride,
uint8_t* WEBP_RESTRICT filtered_data) {
DoGradientFilter_SSE2(data, width, height, stride, filtered_data);
}
//------------------------------------------------------------------------------

View File

@ -107,14 +107,14 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
//------------------------------------------------------------------------------
// Predictors
uint32_t VP8LPredictor0_C(const uint32_t* const left,
const uint32_t* const top) {
static uint32_t VP8LPredictor0_C(const uint32_t* const left,
const uint32_t* const top) {
(void)top;
(void)left;
return ARGB_BLACK;
}
uint32_t VP8LPredictor1_C(const uint32_t* const left,
const uint32_t* const top) {
static uint32_t VP8LPredictor1_C(const uint32_t* const left,
const uint32_t* const top) {
(void)top;
return *left;
}
@ -182,13 +182,13 @@ uint32_t VP8LPredictor13_C(const uint32_t* const left,
}
static void PredictorAdd0_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int x;
(void)upper;
for (x = 0; x < num_pixels; ++x) out[x] = VP8LAddPixels(in[x], ARGB_BLACK);
}
static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
uint32_t left = out[-1];
(void)upper;
@ -441,8 +441,8 @@ static int is_big_endian(void) {
return (tmp.b[0] != 1);
}
void VP8LConvertBGRAToRGB_C(const uint32_t* src,
int num_pixels, uint8_t* dst) {
void VP8LConvertBGRAToRGB_C(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
@ -452,8 +452,8 @@ void VP8LConvertBGRAToRGB_C(const uint32_t* src,
}
}
void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
int num_pixels, uint8_t* dst) {
void VP8LConvertBGRAToRGBA_C(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
@ -464,8 +464,8 @@ void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
}
}
void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
int num_pixels, uint8_t* dst) {
void VP8LConvertBGRAToRGBA4444_C(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
@ -481,8 +481,8 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
}
}
void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
int num_pixels, uint8_t* dst) {
void VP8LConvertBGRAToRGB565_C(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
@ -498,8 +498,8 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
}
}
void VP8LConvertBGRAToBGR_C(const uint32_t* src,
int num_pixels, uint8_t* dst) {
void VP8LConvertBGRAToBGR_C(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
@ -509,8 +509,8 @@ void VP8LConvertBGRAToBGR_C(const uint32_t* src,
}
}
static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
int swap_on_big_endian) {
static void CopyOrSwap(const uint32_t* WEBP_RESTRICT src, int num_pixels,
uint8_t* WEBP_RESTRICT dst, int swap_on_big_endian) {
if (is_big_endian() == swap_on_big_endian) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {

View File

@ -18,6 +18,7 @@
#include "src/webp/types.h"
#include "src/webp/decode.h"
#include "src/dsp/dsp.h"
#include "src/enc/histogram_enc.h"
#include "src/utils/utils.h"
@ -32,10 +33,6 @@ typedef uint32_t (*VP8LPredictorFunc)(const uint32_t* const left,
const uint32_t* const top);
extern VP8LPredictorFunc VP8LPredictors[16];
uint32_t VP8LPredictor0_C(const uint32_t* const left,
const uint32_t* const top);
uint32_t VP8LPredictor1_C(const uint32_t* const left,
const uint32_t* const top);
uint32_t VP8LPredictor2_C(const uint32_t* const left,
const uint32_t* const top);
uint32_t VP8LPredictor3_C(const uint32_t* const left,
@ -64,7 +61,7 @@ uint32_t VP8LPredictor13_C(const uint32_t* const left,
// These Add/Sub functions expect upper[-1] and out[-1] to be readable.
typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
const uint32_t* upper, int num_pixels,
uint32_t* out);
uint32_t* WEBP_RESTRICT out);
extern VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
extern VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
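For context, the pixel addition underlying these typedefs is per-channel and modular (mod 256). A sketch of the usual masked-pair trick, reproduced from memory from lossless_common.h (treat as illustrative): alpha/green and red/blue are summed in disjoint 16-bit lanes so a channel overflow cannot spill into its neighbor.

static WEBP_INLINE uint32_t AddPixels_Sketch(uint32_t a, uint32_t b) {
  const uint32_t alpha_and_green = (a & 0xff00ff00u) + (b & 0xff00ff00u);
  const uint32_t red_and_blue = (a & 0x00ff00ffu) + (b & 0x00ff00ffu);
  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
}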
@ -95,8 +92,8 @@ void VP8LInverseTransform(const struct VP8LTransform* const transform,
const uint32_t* const in, uint32_t* const out);
// Color space conversion.
typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
uint8_t* dst);
typedef void (*VP8LConvertFunc)(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst);
extern VP8LConvertFunc VP8LConvertBGRAToRGB;
extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
@ -131,13 +128,16 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
const uint32_t* src, int num_pixels,
uint32_t* dst);
void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToRGB_C(const uint32_t* WEBP_RESTRICT src, int num_pixels,
uint8_t* WEBP_RESTRICT dst);
void VP8LConvertBGRAToRGBA_C(const uint32_t* WEBP_RESTRICT src, int num_pixels,
uint8_t* WEBP_RESTRICT dst);
void VP8LConvertBGRAToRGBA4444_C(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst);
void VP8LConvertBGRAToRGB565_C(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst);
void VP8LConvertBGRAToBGR_C(const uint32_t* WEBP_RESTRICT src, int num_pixels,
uint8_t* WEBP_RESTRICT dst);
void VP8LAddGreenToBlueAndRed_C(const uint32_t* src, int num_pixels,
uint32_t* dst);
@ -149,32 +149,35 @@ void VP8LDspInit(void);
typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
uint32_t* dst, int num_pixels);
typedef void (*VP8LTransformColorFunc)(
const VP8LMultipliers* WEBP_RESTRICT const m, uint32_t* WEBP_RESTRICT dst,
int num_pixels);
extern VP8LTransformColorFunc VP8LTransformColor;
typedef void (*VP8LCollectColorBlueTransformsFunc)(
const uint32_t* argb, int stride,
const uint32_t* WEBP_RESTRICT argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue, int histo[]);
int green_to_blue, int red_to_blue, uint32_t histo[]);
extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
typedef void (*VP8LCollectColorRedTransformsFunc)(
const uint32_t* argb, int stride,
const uint32_t* WEBP_RESTRICT argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]);
int green_to_red, uint32_t histo[]);
extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
// Expose some C-only fallback functions
void VP8LTransformColor_C(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels);
void VP8LTransformColor_C(const VP8LMultipliers* WEBP_RESTRICT const m,
uint32_t* WEBP_RESTRICT data, int num_pixels);
void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
void VP8LCollectColorRedTransforms_C(const uint32_t* WEBP_RESTRICT argb,
int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]);
void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
int green_to_red, uint32_t histo[]);
void VP8LCollectColorBlueTransforms_C(const uint32_t* WEBP_RESTRICT argb,
int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]);
uint32_t histo[]);
extern VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
@ -183,14 +186,17 @@ extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
// Huffman-cost related functions.
typedef uint32_t (*VP8LCostFunc)(const uint32_t* population, int length);
typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* WEBP_RESTRICT X,
const uint32_t* WEBP_RESTRICT Y,
int length);
typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256],
const int Y[256]);
typedef uint64_t (*VP8LCombinedShannonEntropyFunc)(const uint32_t X[256],
const uint32_t Y[256]);
typedef uint64_t (*VP8LShannonEntropyFunc)(const uint32_t* X, int length);
extern VP8LCostFunc VP8LExtraCost;
extern VP8LCostCombinedFunc VP8LExtraCostCombined;
extern VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
extern VP8LShannonEntropyFunc VP8LShannonEntropy;
typedef struct { // small struct to hold counters
int counts[2]; // index: 0=zero streak, 1=non-zero streak
@ -198,7 +204,7 @@ typedef struct { // small struct to hold counters
} VP8LStreaks;
typedef struct { // small struct to hold bit entropy results
float entropy; // entropy
uint64_t entropy; // entropy
uint32_t sum; // sum of the population
int nonzeros; // number of non-zero elements in the population
uint32_t max_val; // maximum value in the population
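A hedged sketch of how a bit-entropy term is accumulated after the float-to-fixed-point switch (the real code also tracks streaks, nonzeros and max_val, and applies refinements on top): every term carries LOG_2_PRECISION_BITS fractional bits, so the usual H = S*log2(S) - sum(x*log2(x)) identity stays within uint64_t range per the bound documented in lossless_common.h.

static uint64_t BitsEntropy_Sketch(const uint32_t* const x, int n) {
  uint64_t slogs = 0;
  uint32_t sum = 0;
  int i;
  for (i = 0; i < n; ++i) {
    sum += x[i];
    slogs += VP8LFastSLog2(x[i]);     // x * log2(x), fixed-point
  }
  return VP8LFastSLog2(sum) - slogs;  // >= 0, still in fixed-point bits
}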
@ -212,26 +218,30 @@ void VP8LBitEntropyInit(VP8LBitEntropy* const entropy);
// codec specific heuristics.
typedef void (*VP8LGetCombinedEntropyUnrefinedFunc)(
const uint32_t X[], const uint32_t Y[], int length,
VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats);
VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
VP8LStreaks* WEBP_RESTRICT const stats);
extern VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
// Get the entropy for the distribution 'X'.
typedef void (*VP8LGetEntropyUnrefinedFunc)(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats);
typedef void (*VP8LGetEntropyUnrefinedFunc)(
const uint32_t X[], int length,
VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
VP8LStreaks* WEBP_RESTRICT const stats);
extern VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
VP8LBitEntropy* const entropy);
void VP8LBitsEntropyUnrefined(const uint32_t* WEBP_RESTRICT const array, int n,
VP8LBitEntropy* WEBP_RESTRICT const entropy);
typedef void (*VP8LAddVectorFunc)(const uint32_t* a, const uint32_t* b,
uint32_t* out, int size);
typedef void (*VP8LAddVectorFunc)(const uint32_t* WEBP_RESTRICT a,
const uint32_t* WEBP_RESTRICT b,
uint32_t* WEBP_RESTRICT out, int size);
extern VP8LAddVectorFunc VP8LAddVector;
typedef void (*VP8LAddVectorEqFunc)(const uint32_t* a, uint32_t* out, int size);
typedef void (*VP8LAddVectorEqFunc)(const uint32_t* WEBP_RESTRICT a,
uint32_t* WEBP_RESTRICT out, int size);
extern VP8LAddVectorEqFunc VP8LAddVectorEq;
void VP8LHistogramAdd(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out);
void VP8LHistogramAdd(const VP8LHistogram* WEBP_RESTRICT const a,
const VP8LHistogram* WEBP_RESTRICT const b,
VP8LHistogram* WEBP_RESTRICT const out);
// -----------------------------------------------------------------------------
// PrefixEncode()
@ -241,11 +251,12 @@ typedef int (*VP8LVectorMismatchFunc)(const uint32_t* const array1,
// Returns the first index where array1 and array2 are different.
extern VP8LVectorMismatchFunc VP8LVectorMismatch;
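A scalar reference for this contract (illustrative; the dispatched implementations vectorize the same loop):

static int VectorMismatch_Sketch(const uint32_t* const array1,
                                 const uint32_t* const array2, int length) {
  int i = 0;
  while (i < length && array1[i] == array2[i]) ++i;
  return i;  // first differing index, or 'length' if the arrays are equal
}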
typedef void (*VP8LBundleColorMapFunc)(const uint8_t* const row, int width,
int xbits, uint32_t* dst);
typedef void (*VP8LBundleColorMapFunc)(const uint8_t* WEBP_RESTRICT const row,
int width, int xbits,
uint32_t* WEBP_RESTRICT dst);
extern VP8LBundleColorMapFunc VP8LBundleColorMap;
void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
uint32_t* dst);
void VP8LBundleColorMap_C(const uint8_t* WEBP_RESTRICT const row,
int width, int xbits, uint32_t* WEBP_RESTRICT dst);
// Must be called before calling any of the above methods.
void VP8LEncDspInit(void);

View File

@ -73,23 +73,44 @@ static WEBP_INLINE int VP8LNearLosslessBits(int near_lossless_quality) {
// Keeping a high threshold for now.
#define APPROX_LOG_WITH_CORRECTION_MAX 65536
#define APPROX_LOG_MAX 4096
// VP8LFastLog2 and VP8LFastSLog2 are used on elements from image histograms.
// The histogram values cannot exceed the maximum number of pixels, which
// is (1 << 14) * (1 << 14). Therefore S * log(S) < (1 << 33).
// No more than 32 bits of precision should be chosen.
// To match the original float implementation, 23 bits of precision are used.
#define LOG_2_PRECISION_BITS 23
#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
// LOG_2_RECIPROCAL * (1 << LOG_2_PRECISION_BITS)
#define LOG_2_RECIPROCAL_FIXED_DOUBLE 12102203.161561485379934310913085937500
#define LOG_2_RECIPROCAL_FIXED ((uint64_t)12102203)
#define LOG_LOOKUP_IDX_MAX 256
extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
extern const uint32_t kLog2Table[LOG_LOOKUP_IDX_MAX];
extern const uint64_t kSLog2Table[LOG_LOOKUP_IDX_MAX];
typedef uint32_t (*VP8LFastLog2SlowFunc)(uint32_t v);
typedef uint64_t (*VP8LFastSLog2SlowFunc)(uint32_t v);
extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
extern VP8LFastSLog2SlowFunc VP8LFastSLog2Slow;
static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
static WEBP_INLINE uint32_t VP8LFastLog2(uint32_t v) {
return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
}
// Fast calculation of v * log2(v) for integer input.
static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
static WEBP_INLINE uint64_t VP8LFastSLog2(uint32_t v) {
return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
}
static WEBP_INLINE uint64_t RightShiftRound(uint64_t v, uint32_t shift) {
return (v + (1ull << shift >> 1)) >> shift;
}
static WEBP_INLINE int64_t DivRound(int64_t a, int64_t b) {
return ((a < 0) == (b < 0)) ? ((a + b / 2) / b) : ((a - b / 2) / b);
}
#define WEBP_INT64_MAX ((int64_t)((1ull << 63) - 1))
#define WEBP_UINT64_MAX (~0ull)
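// The two rounding helpers round half up on the shift and half away from
// zero on the division; a small self-contained demo (hypothetical test
// harness, not from the patch), with the helpers copied verbatim:
#include <assert.h>
#include <stdint.h>

static uint64_t RightShiftRound(uint64_t v, uint32_t shift) {
  return (v + (1ull << shift >> 1)) >> shift;
}
static int64_t DivRound(int64_t a, int64_t b) {
  return ((a < 0) == (b < 0)) ? ((a + b / 2) / b) : ((a - b / 2) / b);
}

int main(void) {
  assert(RightShiftRound(5, 1) == 3);    // 2.5 -> 3
  assert(RightShiftRound(4, 1) == 2);    // 2.0 -> 2
  assert(DivRound(7, 2) == 4);           // 3.5 -> 4
  assert(DivRound(-7, 2) == -4);         // -3.5 -> -4, away from zero
  return 0;
}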
// -----------------------------------------------------------------------------
// PrefixEncode()
@ -173,15 +194,15 @@ uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
// The predictor is added to the input pixel (which is therefore treated as
// a residual) to reconstruct the final output pixel.
#define GENERATE_PREDICTOR_ADD(PREDICTOR, PREDICTOR_ADD) \
static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int x; \
assert(upper != NULL); \
for (x = 0; x < num_pixels; ++x) { \
const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x); \
out[x] = VP8LAddPixels(in[x], pred); \
} \
#define GENERATE_PREDICTOR_ADD(PREDICTOR, PREDICTOR_ADD) \
static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* WEBP_RESTRICT out) { \
int x; \
assert(upper != NULL); \
for (x = 0; x < num_pixels; ++x) { \
const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x); \
out[x] = VP8LAddPixels(in[x], pred); \
} \
}
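// For context, a sketch of how such a macro is typically instantiated
// (Predictor1 here is a hypothetical left-pixel predictor; the real
// predictor set lives in lossless.c). VP8LAddPixels() and WEBP_INLINE are
// assumed from this header:
static WEBP_INLINE uint32_t Predictor1(const uint32_t* const left,
                                       const uint32_t* const top) {
  (void)top;
  return *left;  // predict each pixel from its left neighbor
}
GENERATE_PREDICTOR_ADD(Predictor1, PredictorAdd1)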
#ifdef __cplusplus


@ -24,203 +24,123 @@
#include "src/dsp/lossless_common.h"
#include "src/dsp/yuv.h"
// lookup table for small values of log2(int)
const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
0.0000000000000000f, 0.0000000000000000f,
1.0000000000000000f, 1.5849625007211560f,
2.0000000000000000f, 2.3219280948873621f,
2.5849625007211560f, 2.8073549220576041f,
3.0000000000000000f, 3.1699250014423121f,
3.3219280948873621f, 3.4594316186372973f,
3.5849625007211560f, 3.7004397181410921f,
3.8073549220576041f, 3.9068905956085187f,
4.0000000000000000f, 4.0874628412503390f,
4.1699250014423121f, 4.2479275134435852f,
4.3219280948873626f, 4.3923174227787606f,
4.4594316186372973f, 4.5235619560570130f,
4.5849625007211560f, 4.6438561897747243f,
4.7004397181410917f, 4.7548875021634682f,
4.8073549220576037f, 4.8579809951275718f,
4.9068905956085187f, 4.9541963103868749f,
5.0000000000000000f, 5.0443941193584533f,
5.0874628412503390f, 5.1292830169449663f,
5.1699250014423121f, 5.2094533656289501f,
5.2479275134435852f, 5.2854022188622487f,
5.3219280948873626f, 5.3575520046180837f,
5.3923174227787606f, 5.4262647547020979f,
5.4594316186372973f, 5.4918530963296747f,
5.5235619560570130f, 5.5545888516776376f,
5.5849625007211560f, 5.6147098441152083f,
5.6438561897747243f, 5.6724253419714951f,
5.7004397181410917f, 5.7279204545631987f,
5.7548875021634682f, 5.7813597135246599f,
5.8073549220576037f, 5.8328900141647412f,
5.8579809951275718f, 5.8826430493618415f,
5.9068905956085187f, 5.9307373375628866f,
5.9541963103868749f, 5.9772799234999167f,
6.0000000000000000f, 6.0223678130284543f,
6.0443941193584533f, 6.0660891904577720f,
6.0874628412503390f, 6.1085244567781691f,
6.1292830169449663f, 6.1497471195046822f,
6.1699250014423121f, 6.1898245588800175f,
6.2094533656289501f, 6.2288186904958804f,
6.2479275134435852f, 6.2667865406949010f,
6.2854022188622487f, 6.3037807481771030f,
6.3219280948873626f, 6.3398500028846243f,
6.3575520046180837f, 6.3750394313469245f,
6.3923174227787606f, 6.4093909361377017f,
6.4262647547020979f, 6.4429434958487279f,
6.4594316186372973f, 6.4757334309663976f,
6.4918530963296747f, 6.5077946401986963f,
6.5235619560570130f, 6.5391588111080309f,
6.5545888516776376f, 6.5698556083309478f,
6.5849625007211560f, 6.5999128421871278f,
6.6147098441152083f, 6.6293566200796094f,
6.6438561897747243f, 6.6582114827517946f,
6.6724253419714951f, 6.6865005271832185f,
6.7004397181410917f, 6.7142455176661224f,
6.7279204545631987f, 6.7414669864011464f,
6.7548875021634682f, 6.7681843247769259f,
6.7813597135246599f, 6.7944158663501061f,
6.8073549220576037f, 6.8201789624151878f,
6.8328900141647412f, 6.8454900509443747f,
6.8579809951275718f, 6.8703647195834047f,
6.8826430493618415f, 6.8948177633079437f,
6.9068905956085187f, 6.9188632372745946f,
6.9307373375628866f, 6.9425145053392398f,
6.9541963103868749f, 6.9657842846620869f,
6.9772799234999167f, 6.9886846867721654f,
7.0000000000000000f, 7.0112272554232539f,
7.0223678130284543f, 7.0334230015374501f,
7.0443941193584533f, 7.0552824355011898f,
7.0660891904577720f, 7.0768155970508308f,
7.0874628412503390f, 7.0980320829605263f,
7.1085244567781691f, 7.1189410727235076f,
7.1292830169449663f, 7.1395513523987936f,
7.1497471195046822f, 7.1598713367783890f,
7.1699250014423121f, 7.1799090900149344f,
7.1898245588800175f, 7.1996723448363644f,
7.2094533656289501f, 7.2191685204621611f,
7.2288186904958804f, 7.2384047393250785f,
7.2479275134435852f, 7.2573878426926521f,
7.2667865406949010f, 7.2761244052742375f,
7.2854022188622487f, 7.2946207488916270f,
7.3037807481771030f, 7.3128829552843557f,
7.3219280948873626f, 7.3309168781146167f,
7.3398500028846243f, 7.3487281542310771f,
7.3575520046180837f, 7.3663222142458160f,
7.3750394313469245f, 7.3837042924740519f,
7.3923174227787606f, 7.4008794362821843f,
7.4093909361377017f, 7.4178525148858982f,
7.4262647547020979f, 7.4346282276367245f,
7.4429434958487279f, 7.4512111118323289f,
7.4594316186372973f, 7.4676055500829976f,
7.4757334309663976f, 7.4838157772642563f,
7.4918530963296747f, 7.4998458870832056f,
7.5077946401986963f, 7.5156998382840427f,
7.5235619560570130f, 7.5313814605163118f,
7.5391588111080309f, 7.5468944598876364f,
7.5545888516776376f, 7.5622424242210728f,
7.5698556083309478f, 7.5774288280357486f,
7.5849625007211560f, 7.5924570372680806f,
7.5999128421871278f, 7.6073303137496104f,
7.6147098441152083f, 7.6220518194563764f,
7.6293566200796094f, 7.6366246205436487f,
7.6438561897747243f, 7.6510516911789281f,
7.6582114827517946f, 7.6653359171851764f,
7.6724253419714951f, 7.6794800995054464f,
7.6865005271832185f, 7.6934869574993252f,
7.7004397181410917f, 7.7073591320808825f,
7.7142455176661224f, 7.7210991887071855f,
7.7279204545631987f, 7.7347096202258383f,
7.7414669864011464f, 7.7481928495894605f,
7.7548875021634682f, 7.7615512324444795f,
7.7681843247769259f, 7.7747870596011736f,
7.7813597135246599f, 7.7879025593914317f,
7.7944158663501061f, 7.8008998999203047f,
7.8073549220576037f, 7.8137811912170374f,
7.8201789624151878f, 7.8265484872909150f,
7.8328900141647412f, 7.8392037880969436f,
7.8454900509443747f, 7.8517490414160571f,
7.8579809951275718f, 7.8641861446542797f,
7.8703647195834047f, 7.8765169465649993f,
7.8826430493618415f, 7.8887432488982591f,
7.8948177633079437f, 7.9008668079807486f,
7.9068905956085187f, 7.9128893362299619f,
7.9188632372745946f, 7.9248125036057812f,
7.9307373375628866f, 7.9366379390025709f,
7.9425145053392398f, 7.9483672315846778f,
7.9541963103868749f, 7.9600019320680805f,
7.9657842846620869f, 7.9715435539507719f,
7.9772799234999167f, 7.9829935746943103f,
7.9886846867721654f, 7.9943534368588577f
// lookup table for small values of log2(int) * (1 << LOG_2_PRECISION_BITS).
// Obtained in Python with:
// a = [ str(round((1<<23)*math.log2(i))) if i else "0" for i in range(256)]
// print(',\n'.join([' '+','.join(v)
// for v in batched([i.rjust(9) for i in a],7)]))
const uint32_t kLog2Table[LOG_LOOKUP_IDX_MAX] = {
0, 0, 8388608, 13295629, 16777216, 19477745, 21684237,
23549800, 25165824, 26591258, 27866353, 29019816, 30072845, 31041538,
31938408, 32773374, 33554432, 34288123, 34979866, 35634199, 36254961,
36845429, 37408424, 37946388, 38461453, 38955489, 39430146, 39886887,
40327016, 40751698, 41161982, 41558811, 41943040, 42315445, 42676731,
43027545, 43368474, 43700062, 44022807, 44337167, 44643569, 44942404,
45234037, 45518808, 45797032, 46069003, 46334996, 46595268, 46850061,
47099600, 47344097, 47583753, 47818754, 48049279, 48275495, 48497560,
48715624, 48929828, 49140306, 49347187, 49550590, 49750631, 49947419,
50141058, 50331648, 50519283, 50704053, 50886044, 51065339, 51242017,
51416153, 51587818, 51757082, 51924012, 52088670, 52251118, 52411415,
52569616, 52725775, 52879946, 53032177, 53182516, 53331012, 53477707,
53622645, 53765868, 53907416, 54047327, 54185640, 54322389, 54457611,
54591338, 54723604, 54854440, 54983876, 55111943, 55238669, 55364082,
55488208, 55611074, 55732705, 55853126, 55972361, 56090432, 56207362,
56323174, 56437887, 56551524, 56664103, 56775645, 56886168, 56995691,
57104232, 57211808, 57318436, 57424133, 57528914, 57632796, 57735795,
57837923, 57939198, 58039632, 58139239, 58238033, 58336027, 58433234,
58529666, 58625336, 58720256, 58814437, 58907891, 59000628, 59092661,
59183999, 59274652, 59364632, 59453947, 59542609, 59630625, 59718006,
59804761, 59890898, 59976426, 60061354, 60145690, 60229443, 60312620,
60395229, 60477278, 60558775, 60639726, 60720140, 60800023, 60879382,
60958224, 61036555, 61114383, 61191714, 61268554, 61344908, 61420785,
61496188, 61571124, 61645600, 61719620, 61793189, 61866315, 61939001,
62011253, 62083076, 62154476, 62225457, 62296024, 62366182, 62435935,
62505289, 62574248, 62642816, 62710997, 62778797, 62846219, 62913267,
62979946, 63046260, 63112212, 63177807, 63243048, 63307939, 63372484,
63436687, 63500551, 63564080, 63627277, 63690146, 63752690, 63814912,
63876816, 63938405, 63999682, 64060650, 64121313, 64181673, 64241734,
64301498, 64360969, 64420148, 64479040, 64537646, 64595970, 64654014,
64711782, 64769274, 64826495, 64883447, 64940132, 64996553, 65052711,
65108611, 65164253, 65219641, 65274776, 65329662, 65384299, 65438691,
65492840, 65546747, 65600416, 65653847, 65707044, 65760008, 65812741,
65865245, 65917522, 65969575, 66021404, 66073013, 66124403, 66175575,
66226531, 66277275, 66327806, 66378127, 66428240, 66478146, 66527847,
66577345, 66626641, 66675737, 66724635, 66773336, 66821842, 66870154,
66918274, 66966204, 67013944, 67061497
};
const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
0.00000000f, 0.00000000f, 2.00000000f, 4.75488750f,
8.00000000f, 11.60964047f, 15.50977500f, 19.65148445f,
24.00000000f, 28.52932501f, 33.21928095f, 38.05374781f,
43.01955001f, 48.10571634f, 53.30296891f, 58.60335893f,
64.00000000f, 69.48686830f, 75.05865003f, 80.71062276f,
86.43856190f, 92.23866588f, 98.10749561f, 104.04192499f,
110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
// lookup table for small values of int*log2(int) * (1 << LOG_2_PRECISION_BITS).
// Obtained in Python with:
// a=[ "%d"%i if i<(1<<32) else "%dull"%i
// for i in [ round((1<<LOG_2_PRECISION_BITS)*math.log2(i)*i) if i
// else 0 for i in range(256)]]
// print(',\n '.join([','.join(v) for v in batched([i.rjust(15)
// for i in a],4)]))
const uint64_t kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
0, 0, 16777216, 39886887,
67108864, 97388723, 130105423, 164848600,
201326592, 239321324, 278663526, 319217973,
360874141, 403539997, 447137711, 491600606,
536870912, 582898099, 629637592, 677049776,
725099212, 773754010, 822985323, 872766924,
923074875, 973887230, 1025183802, 1076945958,
1129156447, 1181799249, 1234859451, 1288323135,
1342177280, 1396409681, 1451008871, 1505964059,
1561265072, 1616902301, 1672866655, 1729149526,
1785742744, 1842638548, 1899829557, 1957308741,
2015069397, 2073105127, 2131409817, 2189977618ull,
2248802933ull, 2307880396ull, 2367204859ull, 2426771383ull,
2486575220ull, 2546611805ull, 2606876748ull, 2667365819ull,
2728074942ull, 2789000187ull, 2850137762ull, 2911484006ull,
2973035382ull, 3034788471ull, 3096739966ull, 3158886666ull,
3221225472ull, 3283753383ull, 3346467489ull, 3409364969ull,
3472443085ull, 3535699182ull, 3599130679ull, 3662735070ull,
3726509920ull, 3790452862ull, 3854561593ull, 3918833872ull,
3983267519ull, 4047860410ull, 4112610476ull, 4177515704ull,
4242574127ull, 4307783833ull, 4373142952ull, 4438649662ull,
4504302186ull, 4570098787ull, 4636037770ull, 4702117480ull,
4768336298ull, 4834692645ull, 4901184974ull, 4967811774ull,
5034571569ull, 5101462912ull, 5168484389ull, 5235634615ull,
5302912235ull, 5370315922ull, 5437844376ull, 5505496324ull,
5573270518ull, 5641165737ull, 5709180782ull, 5777314477ull,
5845565671ull, 5913933235ull, 5982416059ull, 6051013057ull,
6119723161ull, 6188545324ull, 6257478518ull, 6326521733ull,
6395673979ull, 6464934282ull, 6534301685ull, 6603775250ull,
6673354052ull, 6743037185ull, 6812823756ull, 6882712890ull,
6952703725ull, 7022795412ull, 7092987118ull, 7163278025ull,
7233667324ull, 7304154222ull, 7374737939ull, 7445417707ull,
7516192768ull, 7587062379ull, 7658025806ull, 7729082328ull,
7800231234ull, 7871471825ull, 7942803410ull, 8014225311ull,
8085736859ull, 8157337394ull, 8229026267ull, 8300802839ull,
8372666477ull, 8444616560ull, 8516652476ull, 8588773618ull,
8660979393ull, 8733269211ull, 8805642493ull, 8878098667ull,
8950637170ull, 9023257446ull, 9095958945ull, 9168741125ull,
9241603454ull, 9314545403ull, 9387566451ull, 9460666086ull,
9533843800ull, 9607099093ull, 9680431471ull, 9753840445ull,
9827325535ull, 9900886263ull, 9974522161ull, 10048232765ull,
10122017615ull, 10195876260ull, 10269808253ull, 10343813150ull,
10417890516ull, 10492039919ull, 10566260934ull, 10640553138ull,
10714916116ull, 10789349456ull, 10863852751ull, 10938425600ull,
11013067604ull, 11087778372ull, 11162557513ull, 11237404645ull,
11312319387ull, 11387301364ull, 11462350205ull, 11537465541ull,
11612647010ull, 11687894253ull, 11763206912ull, 11838584638ull,
11914027082ull, 11989533899ull, 12065104750ull, 12140739296ull,
12216437206ull, 12292198148ull, 12368021795ull, 12443907826ull,
12519855920ull, 12595865759ull, 12671937032ull, 12748069427ull,
12824262637ull, 12900516358ull, 12976830290ull, 13053204134ull,
13129637595ull, 13206130381ull, 13282682202ull, 13359292772ull,
13435961806ull, 13512689025ull, 13589474149ull, 13666316903ull,
13743217014ull, 13820174211ull, 13897188225ull, 13974258793ull,
14051385649ull, 14128568535ull, 14205807192ull, 14283101363ull,
14360450796ull, 14437855239ull, 14515314443ull, 14592828162ull,
14670396151ull, 14748018167ull, 14825693972ull, 14903423326ull,
14981205995ull, 15059041743ull, 15136930339ull, 15214871554ull,
15292865160ull, 15370910930ull, 15449008641ull, 15527158071ull,
15605359001ull, 15683611210ull, 15761914485ull, 15840268608ull,
15918673369ull, 15997128556ull, 16075633960ull, 16154189373ull,
16232794589ull, 16311449405ull, 16390153617ull, 16468907026ull,
16547709431ull, 16626560636ull, 16705460444ull, 16784408661ull,
16863405094ull, 16942449552ull, 17021541845ull, 17100681785ull
};
const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
@ -326,23 +246,19 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
};
static float FastSLog2Slow_C(uint32_t v) {
static uint64_t FastSLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
const uint64_t orig_v = v;
uint64_t correction;
#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
// use clz if available
const int log_cnt = BitsLog2Floor(v) - 7;
const uint64_t log_cnt = BitsLog2Floor(v) - 7;
const uint32_t y = 1 << log_cnt;
int correction = 0;
const float v_f = (float)v;
const uint32_t orig_v = v;
v >>= log_cnt;
#else
int log_cnt = 0;
uint64_t log_cnt = 0;
uint32_t y = 1;
int correction = 0;
const float v_f = (float)v;
const uint32_t orig_v = v;
do {
++log_cnt;
v = v >> 1;
@ -354,45 +270,43 @@ static float FastSLog2Slow_C(uint32_t v) {
// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
// The correction factor log(1 + d) ~ d holds for very small d values, so
// log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
// LOG_2_RECIPROCAL ~ 23/16
correction = (23 * (orig_v & (y - 1))) >> 4;
return v_f * (kLog2Table[v] + log_cnt) + correction;
correction = LOG_2_RECIPROCAL_FIXED * (orig_v & (y - 1));
return orig_v * (kLog2Table[v] + (log_cnt << LOG_2_PRECISION_BITS)) +
correction;
} else {
return (float)(LOG_2_RECIPROCAL * v * log((double)v));
return (uint64_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * v * log((double)v) + .5);
}
}
static float FastLog2Slow_C(uint32_t v) {
static uint32_t FastLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
const uint32_t orig_v = v;
uint32_t log_2;
#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
// use clz if available
const int log_cnt = BitsLog2Floor(v) - 7;
const uint32_t log_cnt = BitsLog2Floor(v) - 7;
const uint32_t y = 1 << log_cnt;
const uint32_t orig_v = v;
double log_2;
v >>= log_cnt;
#else
int log_cnt = 0;
uint32_t log_cnt = 0;
uint32_t y = 1;
const uint32_t orig_v = v;
double log_2;
do {
++log_cnt;
v = v >> 1;
y = y << 1;
} while (v >= LOG_LOOKUP_IDX_MAX);
#endif
log_2 = kLog2Table[v] + log_cnt;
log_2 = kLog2Table[v] + (log_cnt << LOG_2_PRECISION_BITS);
if (orig_v >= APPROX_LOG_MAX) {
// Since the division is still expensive, add this correction factor only
// for large values of 'v'.
const int correction = (23 * (orig_v & (y - 1))) >> 4;
log_2 += (double)correction / orig_v;
const uint64_t correction = LOG_2_RECIPROCAL_FIXED * (orig_v & (y - 1));
log_2 += (uint32_t)DivRound(correction, orig_v);
}
return (float)log_2;
return log_2;
} else {
return (float)(LOG_2_RECIPROCAL * log((double)v));
return (uint32_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * log((double)v) + .5);
}
}
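// The decomposition both functions rely on, checked in floating point for
// one value (a standalone sketch, not part of the patch):
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint32_t v = 301;               // >= LOG_LOOKUP_IDX_MAX (256)
  const uint32_t log_cnt = 1;           // BitsLog2Floor(301) - 7
  const uint32_t y = 1u << log_cnt;
  const uint32_t small = v >> log_cnt;  // 150, within the lookup table
  // log2(v) = log2(small) + log_cnt + log2(1 + (v % y) / v)
  //        ~= log2(small) + log_cnt + LOG_2_RECIPROCAL * (v % y) / v
  const double approx = log2((double)small) + log_cnt +
                        1.44269504088896338700465094007086 * (v & (y - 1)) / v;
  printf("log2(%u) = %.9f vs approx %.9f\n", v, log2((double)v), approx);
  return 0;
}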
@ -400,37 +314,53 @@ static float FastLog2Slow_C(uint32_t v) {
// Methods to calculate Entropy (Shannon).
// Compute the combined Shannon entropy for the distributions {X} and {X+Y}
static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
static uint64_t CombinedShannonEntropy_C(const uint32_t X[256],
const uint32_t Y[256]) {
int i;
float retval = 0.f;
int sumX = 0, sumXY = 0;
uint64_t retval = 0;
uint32_t sumX = 0, sumXY = 0;
for (i = 0; i < 256; ++i) {
const int x = X[i];
const uint32_t x = X[i];
if (x != 0) {
const int xy = x + Y[i];
const uint32_t xy = x + Y[i];
sumX += x;
retval -= VP8LFastSLog2(x);
retval += VP8LFastSLog2(x);
sumXY += xy;
retval -= VP8LFastSLog2(xy);
retval += VP8LFastSLog2(xy);
} else if (Y[i] != 0) {
sumXY += Y[i];
retval -= VP8LFastSLog2(Y[i]);
retval += VP8LFastSLog2(Y[i]);
}
}
retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
retval = VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY) - retval;
return retval;
}
static uint64_t ShannonEntropy_C(const uint32_t* X, int n) {
int i;
uint64_t retval = 0;
uint32_t sumX = 0;
for (i = 0; i < n; ++i) {
const int x = X[i];
if (x != 0) {
sumX += x;
retval += VP8LFastSLog2(x);
}
}
retval = VP8LFastSLog2(sumX) - retval;
return retval;
}
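// Both functions use the identity that the optimal coding cost of a
// histogram X with total n is n*log2(n) - sum(x*log2(x)); a floating-point
// sanity sketch (standalone, hypothetical data, not from the patch):
#include <math.h>
#include <stdio.h>

int main(void) {
  const unsigned x[4] = {1, 2, 3, 2};
  unsigned n = 0;
  double sum_slog2 = 0.;
  int i;
  for (i = 0; i < 4; ++i) {
    n += x[i];
    if (x[i]) sum_slog2 += x[i] * log2((double)x[i]);
  }
  // 8 * log2(8) - 8.7549... = 15.245 bits for the whole histogram,
  // i.e. ~1.906 bits/symbol of Shannon entropy.
  printf("cost = %.4f bits\n", n * log2((double)n) - sum_slog2);
  return 0;
}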
void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
entropy->entropy = 0.;
entropy->entropy = 0;
entropy->sum = 0;
entropy->nonzeros = 0;
entropy->max_val = 0;
entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM;
}
void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
VP8LBitEntropy* const entropy) {
void VP8LBitsEntropyUnrefined(const uint32_t* WEBP_RESTRICT const array, int n,
VP8LBitEntropy* WEBP_RESTRICT const entropy) {
int i;
VP8LBitEntropyInit(entropy);
@ -440,18 +370,20 @@ void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
entropy->sum += array[i];
entropy->nonzero_code = i;
++entropy->nonzeros;
entropy->entropy -= VP8LFastSLog2(array[i]);
entropy->entropy += VP8LFastSLog2(array[i]);
if (entropy->max_val < array[i]) {
entropy->max_val = array[i];
}
}
}
entropy->entropy += VP8LFastSLog2(entropy->sum);
entropy->entropy = VP8LFastSLog2(entropy->sum) - entropy->entropy;
}
static WEBP_INLINE void GetEntropyUnrefinedHelper(
uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
uint32_t val, int i, uint32_t* WEBP_RESTRICT const val_prev,
int* WEBP_RESTRICT const i_prev,
VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
VP8LStreaks* WEBP_RESTRICT const stats) {
const int streak = i - *i_prev;
// Gather info for the bit entropy.
@ -459,7 +391,7 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
bit_entropy->sum += (*val_prev) * streak;
bit_entropy->nonzeros += streak;
bit_entropy->nonzero_code = *i_prev;
bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
bit_entropy->entropy += VP8LFastSLog2(*val_prev) * streak;
if (bit_entropy->max_val < *val_prev) {
bit_entropy->max_val = *val_prev;
}
@ -473,9 +405,10 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
*i_prev = i;
}
static void GetEntropyUnrefined_C(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
static void GetEntropyUnrefined_C(
const uint32_t X[], int length,
VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
VP8LStreaks* WEBP_RESTRICT const stats) {
int i;
int i_prev = 0;
uint32_t x_prev = X[0];
@ -491,14 +424,13 @@ static void GetEntropyUnrefined_C(const uint32_t X[], int length,
}
GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy;
}
static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
const uint32_t Y[],
int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
static void GetCombinedEntropyUnrefined_C(
const uint32_t X[], const uint32_t Y[], int length,
VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
VP8LStreaks* WEBP_RESTRICT const stats) {
int i = 1;
int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0];
@ -514,7 +446,7 @@ static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
}
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy;
}
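// Both variants scan the histogram as runs of equal values, so each run
// contributes streak * v*log2(v) in one step; a run-length sketch of the
// traversal (standalone, hypothetical data):
#include <assert.h>
#include <stdint.h>

int main(void) {
  const uint32_t X[8] = {5, 5, 5, 0, 0, 7, 7, 7};
  int i, i_prev = 0, runs = 0;
  uint32_t x_prev = X[0];
  for (i = 1; i < 8; ++i) {
    if (X[i] != x_prev) {
      ++runs;              // helper is invoked with streak = i - i_prev
      x_prev = X[i];
      i_prev = i;
    }
  }
  ++runs;                  // trailing run, flushed after the loop
  assert(runs == 3);
  (void)i_prev;
  return 0;
}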
//------------------------------------------------------------------------------
@ -538,8 +470,8 @@ static WEBP_INLINE int8_t U32ToS8(uint32_t v) {
return (int8_t)(v & 0xff);
}
void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
void VP8LTransformColor_C(const VP8LMultipliers* WEBP_RESTRICT const m,
uint32_t* WEBP_RESTRICT data, int num_pixels) {
int i;
for (i = 0; i < num_pixels; ++i) {
const uint32_t argb = data[i];
@ -575,9 +507,10 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
return (new_blue & 0xff);
}
void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
void VP8LCollectColorRedTransforms_C(const uint32_t* WEBP_RESTRICT argb,
int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
int green_to_red, uint32_t histo[]) {
while (tile_height-- > 0) {
int x;
for (x = 0; x < tile_width; ++x) {
@ -587,10 +520,11 @@ void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
}
}
void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
void VP8LCollectColorBlueTransforms_C(const uint32_t* WEBP_RESTRICT argb,
int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
uint32_t histo[]) {
while (tile_height-- > 0) {
int x;
for (x = 0; x < tile_width; ++x) {
@ -614,8 +548,8 @@ static int VectorMismatch_C(const uint32_t* const array1,
}
// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
uint32_t* dst) {
void VP8LBundleColorMap_C(const uint8_t* WEBP_RESTRICT const row,
int width, int xbits, uint32_t* WEBP_RESTRICT dst) {
int x;
if (xbits > 0) {
const int bit_depth = 1 << (3 - xbits);
@ -646,7 +580,8 @@ static uint32_t ExtraCost_C(const uint32_t* population, int length) {
return cost;
}
static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
static uint32_t ExtraCostCombined_C(const uint32_t* WEBP_RESTRICT X,
const uint32_t* WEBP_RESTRICT Y,
int length) {
int i;
uint32_t cost = X[4] + Y[4] + X[5] + Y[5];
@ -661,13 +596,15 @@ static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
//------------------------------------------------------------------------------
static void AddVector_C(const uint32_t* a, const uint32_t* b, uint32_t* out,
int size) {
static void AddVector_C(const uint32_t* WEBP_RESTRICT a,
const uint32_t* WEBP_RESTRICT b,
uint32_t* WEBP_RESTRICT out, int size) {
int i;
for (i = 0; i < size; ++i) out[i] = a[i] + b[i];
}
static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) {
static void AddVectorEq_C(const uint32_t* WEBP_RESTRICT a,
uint32_t* WEBP_RESTRICT out, int size) {
int i;
for (i = 0; i < size; ++i) out[i] += a[i];
}
@ -696,8 +633,9 @@ static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) {
} \
} while (0)
void VP8LHistogramAdd(const VP8LHistogram* const a,
const VP8LHistogram* const b, VP8LHistogram* const out) {
void VP8LHistogramAdd(const VP8LHistogram* WEBP_RESTRICT const a,
const VP8LHistogram* WEBP_RESTRICT const b,
VP8LHistogram* WEBP_RESTRICT const out) {
int i;
const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
assert(a->palette_code_bits_ == b->palette_code_bits_);
@ -727,14 +665,14 @@ void VP8LHistogramAdd(const VP8LHistogram* const a,
// Image transforms.
static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK);
(void)upper;
}
static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]);
(void)upper;
@ -745,7 +683,8 @@ static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
#define GENERATE_PREDICTOR_SUB(PREDICTOR_I) \
static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in, \
const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int num_pixels, \
uint32_t* WEBP_RESTRICT out) { \
int x; \
assert(upper != NULL); \
for (x = 0; x < num_pixels; ++x) { \
@ -778,11 +717,12 @@ VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
VP8LFastLog2SlowFunc VP8LFastLog2Slow;
VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
VP8LFastSLog2SlowFunc VP8LFastSLog2Slow;
VP8LCostFunc VP8LExtraCost;
VP8LCostCombinedFunc VP8LExtraCostCombined;
VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
VP8LShannonEntropyFunc VP8LShannonEntropy;
VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
@ -822,6 +762,7 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
VP8LExtraCost = ExtraCost_C;
VP8LExtraCostCombined = ExtraCostCombined_C;
VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
VP8LShannonEntropy = ShannonEntropy_C;
VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
@ -911,6 +852,7 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
assert(VP8LExtraCost != NULL);
assert(VP8LExtraCostCombined != NULL);
assert(VP8LCombinedShannonEntropy != NULL);
assert(VP8LShannonEntropy != NULL);
assert(VP8LGetEntropyUnrefined != NULL);
assert(VP8LGetCombinedEntropyUnrefined != NULL);
assert(VP8LAddVector != NULL);


@ -23,12 +23,12 @@
#include <stdlib.h>
#include <string.h>
static float FastSLog2Slow_MIPS32(uint32_t v) {
static uint64_t FastSLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y, correction;
uint32_t log_cnt, y;
uint64_t correction;
const int c24 = 24;
const float v_f = (float)v;
uint32_t temp;
// Xf = 256 = 2^8
@ -49,22 +49,23 @@ static float FastSLog2Slow_MIPS32(uint32_t v) {
// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
// The correction factor log(1 + d) ~ d holds for very small d values, so
// log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
// LOG_2_RECIPROCAL ~ 23/16
// (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
correction = (23 * (v & (y - 1))) >> 4;
return v_f * (kLog2Table[temp] + log_cnt) + correction;
correction = LOG_2_RECIPROCAL_FIXED * (v & (y - 1));
return (uint64_t)v * (kLog2Table[temp] +
((uint64_t)log_cnt << LOG_2_PRECISION_BITS)) +
correction;
} else {
return (float)(LOG_2_RECIPROCAL * v * log((double)v));
return (uint64_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * v * log((double)v) + .5);
}
}
static float FastLog2Slow_MIPS32(uint32_t v) {
static uint32_t FastLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y;
const int c24 = 24;
double log_2;
uint32_t log_2;
uint32_t temp;
__asm__ volatile(
@ -78,17 +79,16 @@ static float FastLog2Slow_MIPS32(uint32_t v) {
: [c24]"r"(c24), [v]"r"(v)
);
log_2 = kLog2Table[temp] + log_cnt;
log_2 = kLog2Table[temp] + (log_cnt << LOG_2_PRECISION_BITS);
if (v >= APPROX_LOG_MAX) {
// Since the division is still expensive, add this correction factor only
// for large values of 'v'.
const uint32_t correction = (23 * (v & (y - 1))) >> 4;
log_2 += (double)correction / v;
const uint64_t correction = LOG_2_RECIPROCAL_FIXED * (v & (y - 1));
log_2 += (uint32_t)DivRound(correction, v);
}
return (float)log_2;
return log_2;
} else {
return (float)(LOG_2_RECIPROCAL * log((double)v));
return (uint32_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * log((double)v) + .5);
}
}
@ -149,8 +149,9 @@ static uint32_t ExtraCost_MIPS32(const uint32_t* const population, int length) {
// pY += 2;
// }
// return cost;
static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X,
const uint32_t* const Y, int length) {
static uint32_t ExtraCostCombined_MIPS32(const uint32_t* WEBP_RESTRICT const X,
const uint32_t* WEBP_RESTRICT const Y,
int length) {
int i, temp0, temp1, temp2, temp3;
const uint32_t* pX = &X[4];
const uint32_t* pY = &Y[4];
@ -215,8 +216,10 @@ static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X,
// Returns the various RLE counts
static WEBP_INLINE void GetEntropyUnrefinedHelper(
uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
uint32_t val, int i, uint32_t* WEBP_RESTRICT const val_prev,
int* WEBP_RESTRICT const i_prev,
VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
VP8LStreaks* WEBP_RESTRICT const stats) {
int* const pstreaks = &stats->streaks[0][0];
int* const pcnts = &stats->counts[0];
int temp0, temp1, temp2, temp3;
@ -227,7 +230,7 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
bit_entropy->sum += (*val_prev) * streak;
bit_entropy->nonzeros += streak;
bit_entropy->nonzero_code = *i_prev;
bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
bit_entropy->entropy += VP8LFastSLog2(*val_prev) * streak;
if (bit_entropy->max_val < *val_prev) {
bit_entropy->max_val = *val_prev;
}
@ -241,9 +244,10 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
*i_prev = i;
}
static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
static void GetEntropyUnrefined_MIPS32(
const uint32_t X[], int length,
VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
VP8LStreaks* WEBP_RESTRICT const stats) {
int i;
int i_prev = 0;
uint32_t x_prev = X[0];
@ -259,14 +263,13 @@ static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
}
GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy;
}
static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
const uint32_t Y[],
int length,
VP8LBitEntropy* const entropy,
VP8LStreaks* const stats) {
static void GetCombinedEntropyUnrefined_MIPS32(
const uint32_t X[], const uint32_t Y[], int length,
VP8LBitEntropy* WEBP_RESTRICT const entropy,
VP8LStreaks* WEBP_RESTRICT const stats) {
int i = 1;
int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0];
@ -282,7 +285,7 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
}
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
entropy->entropy += VP8LFastSLog2(entropy->sum);
entropy->entropy = VP8LFastSLog2(entropy->sum) - entropy->entropy;
}
#define ASM_START \
@ -344,8 +347,9 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
ASM_END_COMMON_0 \
ASM_END_COMMON_1
static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
uint32_t* pout, int size) {
static void AddVector_MIPS32(const uint32_t* WEBP_RESTRICT pa,
const uint32_t* WEBP_RESTRICT pb,
uint32_t* WEBP_RESTRICT pout, int size) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const int end = ((size) / 4) * 4;
const uint32_t* const LoopEnd = pa + end;
@ -356,7 +360,8 @@ static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
for (i = 0; i < size - end; ++i) pout[i] = pa[i] + pb[i];
}
static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) {
static void AddVectorEq_MIPS32(const uint32_t* WEBP_RESTRICT pa,
uint32_t* WEBP_RESTRICT pout, int size) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const int end = ((size) / 4) * 4;
const uint32_t* const LoopEnd = pa + end;


@ -78,8 +78,9 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
return (uint32_t)((int)(color_pred) * color) >> 5;
}
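// ColorTransformDelta() is a signed 3.5 fixed-point multiply; a quick
// standalone check (the body and signature are reproduced here from the C
// version as an assumption):
#include <assert.h>
#include <stdint.h>

static uint32_t ColorTransformDelta(int8_t color_pred, int8_t color) {
  return (uint32_t)((int)(color_pred) * color) >> 5;
}

int main(void) {
  // A multiplier of 16 is 0.5 in 3.5 fixed point: it halves the channel.
  assert((ColorTransformDelta(16, 100) & 0xff) == 50);
  assert((ColorTransformDelta(-16, 100) & 0xff) == 0xce);  // -50 mod 256
  return 0;
}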
static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels) {
static void TransformColor_MIPSdspR2(
const VP8LMultipliers* WEBP_RESTRICT const m, uint32_t* WEBP_RESTRICT data,
int num_pixels) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red, new_red1;
const uint32_t G_to_R = m->green_to_red_;
@ -171,13 +172,10 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
return (new_blue & 0xff);
}
static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb,
int stride,
int tile_width,
int tile_height,
int green_to_blue,
int red_to_blue,
int histo[]) {
static void CollectColorBlueTransforms_MIPSdspR2(
const uint32_t* WEBP_RESTRICT argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue, uint32_t histo[]) {
const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
const uint32_t mask = 0xff00ffu;
@ -225,12 +223,9 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
return (new_red & 0xff);
}
static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
int stride,
int tile_width,
int tile_height,
int green_to_red,
int histo[]) {
static void CollectColorRedTransforms_MIPSdspR2(
const uint32_t* WEBP_RESTRICT argb, int stride,
int tile_width, int tile_height, int green_to_red, uint32_t histo[]) {
const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
while (tile_height-- > 0) {
int x;


@ -48,8 +48,8 @@
dst = VSHF_UB(src, t0, mask1); \
} while (0)
static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
static void TransformColor_MSA(const VP8LMultipliers* WEBP_RESTRICT const m,
uint32_t* WEBP_RESTRICT data, int num_pixels) {
v16u8 src0, dst0;
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
(m->green_to_red_ << 16));


@ -72,8 +72,9 @@ static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
//------------------------------------------------------------------------------
// Color Transform
static void TransformColor_NEON(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
static void TransformColor_NEON(const VP8LMultipliers* WEBP_RESTRICT const m,
uint32_t* WEBP_RESTRICT argb_data,
int num_pixels) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = {


@ -49,8 +49,9 @@ static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
#define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
static void TransformColor_SSE2(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
static void TransformColor_SSE2(const VP8LMultipliers* WEBP_RESTRICT const m,
uint32_t* WEBP_RESTRICT argb_data,
int num_pixels) {
const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
CST_5b(m->green_to_blue_));
const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
@ -79,10 +80,11 @@ static void TransformColor_SSE2(const VP8LMultipliers* const m,
//------------------------------------------------------------------------------
#define SPAN 8
static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
static void CollectColorBlueTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb,
int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
uint32_t histo[]) {
const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0);
const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
@ -126,9 +128,10 @@ static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
}
}
static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
static void CollectColorRedTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb,
int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
int green_to_red, uint32_t histo[]) {
const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask = _mm_set1_epi32(0xff);
@ -172,75 +175,113 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
// that's ok since the histogram values are less than 1<<28 (max picture size).
#define LINE_SIZE 16 // 8 or 16
static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
int size) {
int i;
for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a,
const uint32_t* WEBP_RESTRICT b,
uint32_t* WEBP_RESTRICT out, int size) {
int i = 0;
int aligned_size = size & ~15;
// Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
// NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
// 2). See the usage in VP8LHistogramAdd().
assert(size >= 16);
assert(size % 2 == 0);
do {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
#if (LINE_SIZE == 16)
const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
#endif
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
#if (LINE_SIZE == 16)
const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]);
const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
#endif
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
_mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
_mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif
i += 16;
} while (i != aligned_size);
if ((size & 8) != 0) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
i += 8;
}
for (; i < size; ++i) {
out[i] = a[i] + b[i];
size &= 7;
if (size == 4) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]);
_mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
} else if (size == 2) {
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[i]);
_mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
}
}
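// Given the sizes quoted in the comment (40, or 280 plus 0 or a non-zero
// power of 2), the tail left after the 16- and 8-wide blocks can only be
// 0, 2 or 4, so no size == 6 case is needed. A standalone check of that
// claim (the size list is an assumption based on the comment above):
#include <assert.h>

int main(void) {
  const int sizes[] = {40, 280, 282, 284, 288, 296, 312, 344, 408, 536, 1304};
  int i;
  for (i = 0; i < (int)(sizeof(sizes) / sizeof(sizes[0])); ++i) {
    const int tail = sizes[i] & 7;  // remainder after the 8-wide block
    assert(tail == 0 || tail == 2 || tail == 4);
  }
  return 0;
}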
static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
int i;
for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a,
uint32_t* WEBP_RESTRICT out, int size) {
int i = 0;
int aligned_size = size & ~15;
// Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
// NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
// 2). See the usage in VP8LHistogramAdd().
assert(size >= 16);
assert(size % 2 == 0);
do {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
#if (LINE_SIZE == 16)
const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
#endif
const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
#if (LINE_SIZE == 16)
const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]);
const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
#endif
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
_mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
_mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif
i += 16;
} while (i != aligned_size);
if ((size & 8) != 0) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
i += 8;
}
for (; i < size; ++i) {
out[i] += a[i];
size &= 7;
if (size == 4) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i]);
_mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
} else if (size == 2) {
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
const __m128i b0 = _mm_loadl_epi64((const __m128i*)&out[i]);
_mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
}
}
#undef LINE_SIZE
//------------------------------------------------------------------------------
// Entropy
// TODO(https://crbug.com/webp/499): this function produces different results
// from the C code due to use of double/float resulting in output differences
// when compared to -noasm.
#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86))
#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
static uint64_t CombinedShannonEntropy_SSE2(const uint32_t X[256],
const uint32_t Y[256]) {
int i;
float retval = 0.f;
int sumX = 0, sumXY = 0;
uint64_t retval = 0;
uint32_t sumX = 0, sumXY = 0;
const __m128i zero = _mm_setzero_si128();
for (i = 0; i < 256; i += 16) {
@ -260,19 +301,19 @@ static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
while (my) {
const int32_t j = BitsCtz(my);
int xy;
uint32_t xy;
if ((mx >> j) & 1) {
const int x = X[i + j];
sumXY += x;
retval -= VP8LFastSLog2(x);
retval += VP8LFastSLog2(x);
}
xy = X[i + j] + Y[i + j];
sumX += xy;
retval -= VP8LFastSLog2(xy);
retval += VP8LFastSLog2(xy);
my &= my - 1;
}
}
retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
retval = VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY) - retval;
return retval;
}
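// The inner loop visits only the nonzero lanes flagged by the movemask;
// 'my &= my - 1' clears the lowest set bit each iteration. A minimal demo
// of that idiom (standalone, hypothetical bit pattern):
#include <assert.h>
#include <stdint.h>

int main(void) {
  uint32_t my = 0x148;   // bits 3, 6 and 8 set
  int visited = 0;
  while (my) {
    ++visited;           // the real loop processes lane BitsCtz(my) here
    my &= my - 1;        // 0x148 -> 0x140 -> 0x100 -> 0
  }
  assert(visited == 3);
  return 0;
}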
@ -335,8 +376,9 @@ static int VectorMismatch_SSE2(const uint32_t* const array1,
}
// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
uint32_t* dst) {
static void BundleColorMap_SSE2(const uint8_t* WEBP_RESTRICT const row,
int width, int xbits,
uint32_t* WEBP_RESTRICT dst) {
int x;
assert(xbits >= 0);
assert(xbits <= 3);
@ -425,7 +467,7 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
// Predictor0: ARGB_BLACK.
static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -442,7 +484,8 @@ static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
#define GENERATE_PREDICTOR_1(X, IN) \
static void PredictorSub##X##_SSE2(const uint32_t* const in, \
const uint32_t* const upper, \
int num_pixels, uint32_t* const out) { \
int num_pixels, \
uint32_t* WEBP_RESTRICT const out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
@ -464,7 +507,7 @@ GENERATE_PREDICTOR_1(4, upper[i - 1]) // Predictor4: TL
// Predictor5: avg2(avg2(L, TR), T)
static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
@ -484,7 +527,8 @@ static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
#define GENERATE_PREDICTOR_2(X, A, B) \
static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int num_pixels, \
uint32_t* WEBP_RESTRICT out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const __m128i tA = _mm_loadu_si128((const __m128i*)&(A)); \
@ -508,7 +552,7 @@ GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1]) // Predictor9: average(T, TR)
// Predictor10: avg(avg(L,TL), avg(T, TR)).
static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
@ -543,7 +587,7 @@ static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
}
static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
@ -569,7 +613,7 @@ static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
// Predictor12: ClampedSubSubtractFull.
static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -598,7 +642,7 @@ static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
// Predictor13: ClampedAddSubtractHalf
static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 2 <= num_pixels; i += 2) {


@ -44,8 +44,9 @@ static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) {
return HorizontalSum_SSE41(cost);
}
static uint32_t ExtraCostCombined_SSE41(const uint32_t* const a,
const uint32_t* const b, int length) {
static uint32_t ExtraCostCombined_SSE41(const uint32_t* WEBP_RESTRICT const a,
const uint32_t* WEBP_RESTRICT const b,
int length) {
int i;
__m128i cost = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]),
_mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4]));
@ -95,10 +96,11 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
#define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
static void CollectColorBlueTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb,
int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
uint32_t histo[]) {
const __m128i mult =
MK_CST_16(CST_5b(red_to_blue) + 256, CST_5b(green_to_blue));
const __m128i perm =
@ -141,10 +143,11 @@ static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
}
}
static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
static void CollectColorRedTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb,
int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
int green_to_red,
uint32_t histo[]) {
const __m128i mult = MK_CST_16(0, CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x0000ff00);
if (tile_width >= 4) {


@ -26,8 +26,8 @@
#if !defined(WORK_AROUND_GCC)
// gcc 4.6.0 (NDK r9) had some trouble with this code, so it is only used
// with gcc 4.8.x and later.
static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@ -41,8 +41,8 @@ static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs
}
static void ConvertBGRAToBGR_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@ -53,8 +53,8 @@ static void ConvertBGRAToBGR_NEON(const uint32_t* src,
VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs
}
static void ConvertBGRAToRGB_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@ -71,8 +71,8 @@ static void ConvertBGRAToRGB_NEON(const uint32_t* src,
static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
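// Each output byte i takes input byte kRGBAShuffle[i], so two BGRA pixels
// become RGBA in one table lookup; a scalar sketch of what vtbl1_u8 does
// with this table (standalone, hypothetical data):
#include <assert.h>
#include <stdint.h>

int main(void) {
  const uint8_t shuffle[8] = {2, 1, 0, 3, 6, 5, 4, 7};
  const uint8_t bgra[8] = {'B', 'G', 'R', 'A', 'b', 'g', 'r', 'a'};
  uint8_t rgba[8];
  int i;
  for (i = 0; i < 8; ++i) rgba[i] = bgra[shuffle[i]];
  assert(rgba[0] == 'R' && rgba[1] == 'G' && rgba[2] == 'B' && rgba[3] == 'A');
  return 0;
}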
static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const uint32_t* const end = src + (num_pixels & ~1);
const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
for (; src < end; src += 2) {
@ -89,8 +89,8 @@ static const uint8_t kBGRShuffle[3][8] = {
{ 21, 22, 24, 25, 26, 28, 29, 30 }
};
static void ConvertBGRAToBGR_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
@ -116,8 +116,8 @@ static const uint8_t kRGBShuffle[3][8] = {
{ 21, 20, 26, 25, 24, 30, 29, 28 }
};
static void ConvertBGRAToRGB_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
@ -209,7 +209,7 @@ static uint32_t Predictor13_NEON(const uint32_t* const left,
// Predictor0: ARGB_BLACK.
static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
const uint8x16_t black = vreinterpretq_u8_u32(vdupq_n_u32(ARGB_BLACK));
for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -222,7 +222,7 @@ static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
// Predictor1: left.
static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
const uint8x16_t zero = LOADQ_U32_AS_U8(0);
for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -248,7 +248,7 @@ static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
#define GENERATE_PREDICTOR_1(X, IN) \
static void PredictorAdd##X##_NEON(const uint32_t* in, \
const uint32_t* upper, int num_pixels, \
uint32_t* out) { \
uint32_t* WEBP_RESTRICT out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \
@ -276,7 +276,7 @@ GENERATE_PREDICTOR_1(4, upper[i - 1])
} while (0)
static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -301,7 +301,7 @@ static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
// Predictor6: average(left, TL)
static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -317,7 +317,7 @@ static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
// Predictor7: average(left, T)
static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -335,7 +335,7 @@ static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
#define GENERATE_PREDICTOR_2(X, IN) \
static void PredictorAdd##X##_NEON(const uint32_t* in, \
const uint32_t* upper, int num_pixels, \
uint32_t* out) { \
uint32_t* WEBP_RESTRICT out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \
@ -363,7 +363,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
} while (0)
static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -394,7 +394,7 @@ static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
} while (0)
static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -427,7 +427,7 @@ static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
} while (0)
static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1]));
for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -468,7 +468,7 @@ static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
} while (0)
static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
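
For orientation, the PredictorAddN functions above undo the lossless predictor transform: each output pixel is the decoded residual plus a prediction, added channel-wise with 8-bit wrap-around. A scalar sketch of Predictor1 (predict from the left neighbor); the AddPixels helper mirrors the form of libwebp's VP8LAddPixels, where alternating byte lanes are summed separately so carries fall into the masked-off gaps:

#include <stdint.h>

/* Channel-wise 8-bit addition of two packed ARGB pixels, wrap-around. */
static uint32_t AddPixels(uint32_t a, uint32_t b) {
  const uint32_t alpha_and_green = (a & 0xff00ff00u) + (b & 0xff00ff00u);
  const uint32_t red_and_blue = (a & 0x00ff00ffu) + (b & 0x00ff00ffu);
  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
}

/* Scalar Predictor1 ("left"): out[i] = in[i] + out[i - 1]. The NEON
 * code above computes the same recurrence four pixels at a time,
 * carrying L across iterations. */
void PredictorAdd1_Scalar(const uint32_t* in, int num_pixels, uint32_t* out) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    out[i] = AddPixels(in[i], out[i - 1]);  /* out[-1]: left neighbor */
  }
}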

View File

@ -186,7 +186,7 @@ static uint32_t Predictor13_SSE2(const uint32_t* const left,
// Predictor0: ARGB_BLACK.
static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -202,7 +202,7 @@ static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
// Predictor1: left.
static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
__m128i prev = _mm_set1_epi32((int)out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -230,7 +230,8 @@ static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
// per 8 bit channel.
#define GENERATE_PREDICTOR_1(X, IN) \
static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int num_pixels, \
uint32_t* WEBP_RESTRICT out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
@ -259,7 +260,8 @@ GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)
#define GENERATE_PREDICTOR_2(X, IN) \
static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int num_pixels, \
uint32_t* WEBP_RESTRICT out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN)); \
@ -297,7 +299,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
} while (0)
static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
__m128i L = _mm_cvtsi32_si128((int)out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -344,7 +346,7 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
} while (0)
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
__m128i pa;
__m128i L = _mm_cvtsi32_si128((int)out[-1]);
@ -395,7 +397,7 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
} while (0)
static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
const __m128i zero = _mm_setzero_si128();
const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]);
@ -490,8 +492,8 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
//------------------------------------------------------------------------------
// Color-space conversion functions
static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
uint8_t* dst) {
static void ConvertBGRAToRGB_SSE2(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
@ -526,8 +528,8 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
}
}
static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA_SSE2(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff);
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
@ -554,8 +556,9 @@ static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
}
}
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* WEBP_RESTRICT src,
int num_pixels,
uint8_t* WEBP_RESTRICT dst) {
const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
const __m128i* in = (const __m128i*)src;
@ -590,8 +593,9 @@ static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
}
}
static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB565_SSE2(const uint32_t* WEBP_RESTRICT src,
int num_pixels,
uint8_t* WEBP_RESTRICT dst) {
const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0);
const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8);
const __m128i mask_0x07 = _mm_set1_epi8(0x07);
@ -631,8 +635,8 @@ static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
}
}
static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_SSE2(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
const __m128i* in = (const __m128i*)src;
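
The byte masks in ConvertBGRAToRGB565_SSE2 above (0xf8 keeping the top five bits of a channel, 0xe0/0x07 splitting green's six bits across two output bytes) are easier to follow against a scalar reference. A sketch of the per-pixel packing, ignoring the WEBP_SWAP_16BIT_CSP byte-order handling of the real code:

#include <stdint.h>

/* Pack one BGRA pixel (A at bits 31..24, R 23..16, G 15..8, B 7..0)
 * into RGB565: 5 bits R, 6 bits G, 5 bits B. */
uint16_t BGRAToRGB565(uint32_t bgra) {
  const uint32_t r = (bgra >> 16) & 0xff;
  const uint32_t g = (bgra >> 8) & 0xff;
  const uint32_t b = bgra & 0xff;
  /* (r >> 3) keeps the same bits as the SSE2 mask 0xf8, shifted down. */
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}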

View File

@ -77,8 +77,8 @@ static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
} \
} while (0)
static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
uint8_t* dst) {
static void ConvertBGRAToRGB_SSE41(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
@ -95,8 +95,8 @@ static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
}
}
static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_SSE41(const uint32_t* WEBP_RESTRICT src,
int num_pixels, uint8_t* WEBP_RESTRICT dst) {
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,

View File

@ -26,8 +26,8 @@
//------------------------------------------------------------------------------
// Row import
void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
const uint8_t* src) {
void WebPRescalerImportRowExpand_C(WebPRescaler* WEBP_RESTRICT const wrk,
const uint8_t* WEBP_RESTRICT src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
int channel;
@ -59,8 +59,8 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
}
}
void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk,
const uint8_t* src) {
void WebPRescalerImportRowShrink_C(WebPRescaler* WEBP_RESTRICT const wrk,
const uint8_t* WEBP_RESTRICT src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
int channel;
@ -158,7 +158,8 @@ void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
//------------------------------------------------------------------------------
// Main entry calls
void WebPRescalerImportRow(WebPRescaler* const wrk, const uint8_t* src) {
void WebPRescalerImportRow(WebPRescaler* WEBP_RESTRICT const wrk,
const uint8_t* WEBP_RESTRICT src) {
assert(!WebPRescalerInputDone(wrk));
if (!wrk->x_expand) {
WebPRescalerImportRowShrink(wrk, src);
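
WebPRescalerImportRow dispatches to the shrink or expand importer depending on whether the row is being scaled down or up. For intuition only, the shrink path behaves like an integrating (box) filter over the source; this much-simplified integer-factor sketch is not the real algorithm, which splits boundary samples with WEBP_RESCALER_FIX-point fractional weights for arbitrary ratios:

#include <stdint.h>

/* Simplified horizontal shrink by an integer factor: each output
 * sample is the average of 'factor' consecutive inputs. Illustrative
 * only; names and types do not match the real importer. */
void ImportRowShrinkBox(const uint8_t* src, int src_width,
                        int factor, uint32_t* frow) {
  int x_out, k;
  for (x_out = 0; x_out < src_width / factor; ++x_out) {
    uint32_t sum = 0;
    for (k = 0; k < factor; ++k) sum += src[x_out * factor + k];
    frow[x_out] = sum / factor;
  }
}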

View File

@ -21,8 +21,8 @@
//------------------------------------------------------------------------------
// Row import
static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
const uint8_t* src) {
static void ImportRowShrink_MIPS32(WebPRescaler* WEBP_RESTRICT const wrk,
const uint8_t* WEBP_RESTRICT src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int fx_scale = wrk->fx_scale;
@ -81,8 +81,8 @@ static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
}
}
static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
const uint8_t* src) {
static void ImportRowExpand_MIPS32(WebPRescaler* WEBP_RESTRICT const wrk,
const uint8_t* WEBP_RESTRICT src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int x_add = wrk->x_add;

View File

@ -114,9 +114,9 @@
dst = __msa_copy_s_w((v4i32)t0, 0); \
} while (0)
static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
int length,
WebPRescaler* const wrk) {
static WEBP_INLINE void ExportRowExpand_0(
const uint32_t* WEBP_RESTRICT frow, uint8_t* WEBP_RESTRICT dst, int length,
WebPRescaler* WEBP_RESTRICT const wrk) {
const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
const v4i32 zero = { 0 };
@ -171,9 +171,10 @@ static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
}
}
static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
uint8_t* dst, int length,
WebPRescaler* const wrk) {
static WEBP_INLINE void ExportRowExpand_1(
const uint32_t* WEBP_RESTRICT frow, uint32_t* WEBP_RESTRICT irow,
uint8_t* WEBP_RESTRICT dst, int length,
WebPRescaler* WEBP_RESTRICT const wrk) {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
const v4i32 B1 = __msa_fill_w(B);
@ -262,10 +263,10 @@ static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
}
#if 0 // disabled for now. TODO(skal): make match the C-code
static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
uint8_t* dst, int length,
const uint32_t yscale,
WebPRescaler* const wrk) {
static WEBP_INLINE void ExportRowShrink_0(
const uint32_t* WEBP_RESTRICT frow, uint32_t* WEBP_RESTRICT irow,
uint8_t* WEBP_RESTRICT dst, int length, const uint32_t yscale,
WebPRescaler* WEBP_RESTRICT const wrk) {
const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
@ -348,9 +349,9 @@ static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
}
}
static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
int length,
WebPRescaler* const wrk) {
static WEBP_INLINE void ExportRowShrink_1(
uint32_t* WEBP_RESTRICT irow, uint8_t* WEBP_RESTRICT dst, int length,
WebPRescaler* WEBP_RESTRICT const wrk) {
const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
const v4i32 zero = { 0 };

View File

@ -45,8 +45,8 @@
#error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
#endif
static uint32x4_t Interpolate_NEON(const rescaler_t* const frow,
const rescaler_t* const irow,
static uint32x4_t Interpolate_NEON(const rescaler_t* WEBP_RESTRICT const frow,
const rescaler_t* WEBP_RESTRICT const irow,
uint32_t A, uint32_t B) {
LOAD_32x4(frow, A0);
LOAD_32x4(irow, B0);

View File

@ -43,8 +43,8 @@ static void LoadEightPixels_SSE2(const uint8_t* const src, __m128i* out) {
*out = _mm_unpacklo_epi8(A, zero);
}
static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
const uint8_t* src) {
static void RescalerImportRowExpand_SSE2(WebPRescaler* WEBP_RESTRICT const wrk,
const uint8_t* WEBP_RESTRICT src) {
rescaler_t* frow = wrk->frow;
const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
const int x_add = wrk->x_add;
@ -109,8 +109,8 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
assert(accum == 0);
}
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
const uint8_t* src) {
static void RescalerImportRowShrink_SSE2(WebPRescaler* WEBP_RESTRICT const wrk,
const uint8_t* WEBP_RESTRICT src) {
const int x_sub = wrk->x_sub;
int accum = 0;
const __m128i zero = _mm_setzero_si128();
@ -168,12 +168,10 @@ static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
// Row export
// load *src as epi64, multiply by mult and store result in [out0 ... out3]
static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src,
const __m128i* const mult,
__m128i* const out0,
__m128i* const out1,
__m128i* const out2,
__m128i* const out3) {
static WEBP_INLINE void LoadDispatchAndMult_SSE2(
const rescaler_t* WEBP_RESTRICT const src, const __m128i* const mult,
__m128i* const out0, __m128i* const out1, __m128i* const out2,
__m128i* const out3) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
const __m128i A2 = _mm_srli_epi64(A0, 32);
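
LoadDispatchAndMult_SSE2 widens each 32-bit accumulator into a 64-bit lane so the row-export multiply cannot overflow. The scalar operation it parallelizes is, roughly, the rescaler's fixed-point scaling; constant names below follow the real headers (WEBP_RESCALER_RFIX is 32 in the tree) but treat the sketch as illustrative:

#include <stdint.h>

#define RFIX 32  /* stands in for WEBP_RESCALER_RFIX */
#define ROUNDER ((uint64_t)1 << (RFIX - 1))

/* Scale a 32-bit accumulated sample by a 32-bit fixed-point factor,
 * with rounding; the SSE2 code performs four of these per load. */
uint32_t MultFix(uint32_t x, uint32_t scale) {
  return (uint32_t)(((uint64_t)x * scale + ROUNDER) >> RFIX);
}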

View File

@ -35,10 +35,14 @@ WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
#define LOAD_UV(u, v) ((u) | ((v) << 16))
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int x; \
const int last_pixel_pair = (len - 1) >> 1; \
uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
@ -136,10 +140,14 @@ static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y,
#if !defined(FANCY_UPSAMPLING)
#define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* bot_u, const uint8_t* bot_v, \
uint8_t* top_dst, uint8_t* bot_dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bot_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT bot_u, \
const uint8_t* WEBP_RESTRICT bot_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bot_dst, int len) { \
const int half_len = len >> 1; \
int x; \
assert(top_dst != NULL); \
@ -178,10 +186,14 @@ WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
// YUV444 converter
#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len); \
void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
extern void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len); \
void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \
}
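
The reworked YUV444_FUNC macro above now declares each converter before defining it. Giving these non-static functions a visible prototype keeps warning-clean builds (e.g. -Wmissing-prototypes) happy while still letting the SIMD init code in other files reference them. The shape of the pattern, with hypothetical names:

#include <stdint.h>

/* Declare-then-define, as the macro expansion does. */
extern void MyYuv444Row(const uint8_t* y, const uint8_t* u,
                        const uint8_t* v, uint8_t* dst, int len);
void MyYuv444Row(const uint8_t* y, const uint8_t* u,
                 const uint8_t* v, uint8_t* dst, int len) {
  int i;
  (void)u; (void)v;
  for (i = 0; i < len; ++i) dst[i] = y[i];  /* placeholder body */
}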

View File

@ -143,10 +143,14 @@ static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
#define LOAD_UV(u, v) ((u) | ((v) << 16))
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int x; \
const int last_pixel_pair = (len - 1) >> 1; \
uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
@ -241,8 +245,10 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
// YUV444 converter
#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
}

View File

@ -320,8 +320,10 @@ static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
}
#if !defined(WEBP_REDUCE_CSP)
static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToRgbLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B;
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
@ -347,8 +349,10 @@ static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
}
}
static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToBgrLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B;
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
@ -375,8 +379,10 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
}
#endif // WEBP_REDUCE_CSP
static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToRgbaLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
@ -403,8 +409,10 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
}
}
static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToBgraLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
@ -432,8 +440,10 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
}
#if !defined(WEBP_REDUCE_CSP)
static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToArgbLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
@ -460,8 +470,10 @@ static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
}
}
static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToRgba4444Line(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B, RG, BA, tmp0, tmp1;
while (length >= 16) {
#if (WEBP_SWAP_16BIT_CSP == 1)
@ -496,8 +508,10 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
}
}
static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToRgb565Line(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B, RG, GB, tmp0, tmp1;
while (length >= 16) {
#if (WEBP_SWAP_16BIT_CSP == 1)
@ -564,11 +578,14 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
} while (0)
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bot_dst, int len) \
{ \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bot_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bot_dst, int len) { \
int size = (len - 1) >> 1; \
uint8_t temp_u[64]; \
uint8_t temp_v[64]; \

View File

@ -58,8 +58,9 @@
} while (0)
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample16Pixels_NEON(const uint8_t* r1, const uint8_t* r2,
uint8_t* out) {
static void Upsample16Pixels_NEON(const uint8_t* WEBP_RESTRICT const r1,
const uint8_t* WEBP_RESTRICT const r2,
uint8_t* WEBP_RESTRICT const out) {
UPSAMPLE_16PIXELS(r1, r2, out);
}
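
Upsample16Pixels_NEON wraps the fancy upsampler's bilinear kernel: for a 2x2 block of chroma samples [a b; c d], the output nearest each corner uses 9-3-3-1 weighting with rounding, as documented in the C upsampler. A scalar sketch of one of the four outputs:

#include <stdint.h>

/* Fancy upsampling weight for the sample nearest 'a' in the square
 *   [a b]
 *   [c d]
 * i.e. (9a + 3b + 3c + d + 8) >> 4; the other three outputs permute
 * the weights. */
uint8_t UpsampleNearA(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((9 * a + 3 * b + 3 * c + d + 8) >> 4);
}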
@ -189,57 +190,61 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
} \
}
#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int block; \
/* 16 byte aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[2 * 32 + 15]; \
uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
const int uv_len = (len + 1) >> 1; \
/* 9 pixels must be read-able for each block */ \
const int num_blocks = (uv_len - 1) >> 3; \
const int leftover = uv_len - num_blocks * 8; \
const int last_pos = 1 + 16 * num_blocks; \
\
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
\
const int16x4_t coeff1 = vld1_s16(kCoeffs1); \
const int16x8_t R_Rounder = vdupq_n_s16(-14234); \
const int16x8_t G_Rounder = vdupq_n_s16(8708); \
const int16x8_t B_Rounder = vdupq_n_s16(-17685); \
\
/* Treat the first pixel in regular way */ \
assert(top_y != NULL); \
{ \
const int u0 = (top_u[0] + u_diag) >> 1; \
const int v0 = (top_v[0] + v_diag) >> 1; \
VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst); \
} \
if (bottom_y != NULL) { \
const int u0 = (cur_u[0] + u_diag) >> 1; \
const int v0 = (cur_v[0] + v_diag) >> 1; \
VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst); \
} \
\
for (block = 0; block < num_blocks; ++block) { \
UPSAMPLE_16PIXELS(top_u, cur_u, r_uv); \
UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16); \
CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv, \
top_dst, bottom_dst, 16 * block + 1, 16); \
top_u += 8; \
cur_u += 8; \
top_v += 8; \
cur_v += 8; \
} \
\
UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv); \
UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16); \
CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv, \
top_dst, bottom_dst, last_pos, len - last_pos); \
#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int block; \
/* 16 byte aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[2 * 32 + 15]; \
uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~(uintptr_t)15); \
const int uv_len = (len + 1) >> 1; \
/* 9 pixels must be read-able for each block */ \
const int num_blocks = (uv_len - 1) >> 3; \
const int leftover = uv_len - num_blocks * 8; \
const int last_pos = 1 + 16 * num_blocks; \
\
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
\
const int16x4_t coeff1 = vld1_s16(kCoeffs1); \
const int16x8_t R_Rounder = vdupq_n_s16(-14234); \
const int16x8_t G_Rounder = vdupq_n_s16(8708); \
const int16x8_t B_Rounder = vdupq_n_s16(-17685); \
\
/* Treat the first pixel in regular way */ \
assert(top_y != NULL); \
{ \
const int u0 = (top_u[0] + u_diag) >> 1; \
const int v0 = (top_v[0] + v_diag) >> 1; \
VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst); \
} \
if (bottom_y != NULL) { \
const int u0 = (cur_u[0] + u_diag) >> 1; \
const int v0 = (cur_v[0] + v_diag) >> 1; \
VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst); \
} \
\
for (block = 0; block < num_blocks; ++block) { \
UPSAMPLE_16PIXELS(top_u, cur_u, r_uv); \
UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16); \
CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv, \
top_dst, bottom_dst, 16 * block + 1, 16); \
top_u += 8; \
cur_u += 8; \
top_v += 8; \
cur_v += 8; \
} \
\
UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv); \
UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16); \
CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv, \
top_dst, bottom_dst, last_pos, len - last_pos); \
}
// NEON variants of the fancy upsampler.
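
Besides the WEBP_RESTRICT changes, this hunk (and its SSE4.1 counterpart below) rewrites the alignment mask as ~(uintptr_t)15 instead of ~15, keeping the computation in the pointer-width unsigned type rather than relying on the implicit conversion of the negative int ~15. The idiom itself, carving a 16-byte-aligned scratch pointer out of an over-allocated stack array:

#include <stdint.h>

void AlignedScratchExample(void) {
  uint8_t uv_buf[2 * 32 + 15];  /* 15 spare bytes allow realignment */
  uint8_t* const r_uv =
      (uint8_t*)((uintptr_t)(uv_buf + 15) & ~(uintptr_t)15);
  /* r_uv is 16-byte aligned and points within uv_buf. */
  (void)r_uv;
}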

View File

@ -88,8 +88,9 @@
} while (0)
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
uint8_t* const out) {
static void Upsample32Pixels_SSE2(const uint8_t* WEBP_RESTRICT const r1,
const uint8_t* WEBP_RESTRICT const r2,
uint8_t* WEBP_RESTRICT const out) {
UPSAMPLE_32PIXELS(r1, r2, out);
}
@ -114,10 +115,14 @@ static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
} while (0)
#define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int uv_pos, pos; \
/* 16byte-aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
@ -215,10 +220,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
extern void WebPInitYUV444ConvertersSSE2(void);
#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len); \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
extern void CALL_C(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len); \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \
const int max_len = len & ~31; \
for (i = 0; i < max_len; i += 32) { \

View File

@ -90,8 +90,9 @@
} while (0)
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
uint8_t* const out) {
static void Upsample32Pixels_SSE41(const uint8_t* WEBP_RESTRICT const r1,
const uint8_t* WEBP_RESTRICT const r2,
uint8_t* WEBP_RESTRICT const out) {
UPSAMPLE_32PIXELS(r1, r2, out);
}
@ -116,14 +117,18 @@ static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
} while (0)
#define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int uv_pos, pos; \
/* 16byte-aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~(uintptr_t)15); \
uint8_t* const r_v = r_u + 32; \
\
assert(top_y != NULL); \
@ -202,10 +207,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
extern void WebPInitYUV444ConvertersSSE41(void);
#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len); \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
extern void CALL_C(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len); \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \
const int max_len = len & ~31; \
for (i = 0; i < max_len; i += 32) { \

View File

@ -20,9 +20,10 @@
// Plain-C version
#define ROW_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
const uint8_t* const end = dst + (len & ~1) * (XSTEP); \
while (dst != end) { \
FUNC(y[0], u[0], v[0], dst); \
@ -49,9 +50,10 @@ ROW_FUNC(YuvToRgb565Row, VP8YuvToRgb565, 2)
#undef ROW_FUNC
// Main call for processing a plane with a WebPSamplerRowFunc function:
void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
const uint8_t* u, const uint8_t* v, int uv_stride,
uint8_t* dst, int dst_stride,
void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v, int uv_stride,
uint8_t* WEBP_RESTRICT dst, int dst_stride,
int width, int height, WebPSamplerRowFunc func) {
int j;
for (j = 0; j < height; ++j) {
@ -117,7 +119,8 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
//-----------------------------------------------------------------------------
// ARGB -> YUV converters
static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
static void ConvertARGBToY_C(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT y, int width) {
int i;
for (i = 0; i < width; ++i) {
const uint32_t p = argb[i];
@ -126,7 +129,8 @@ static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
}
}
void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int src_width, int do_store) {
// No rounding. Last pixel is dealt with separately.
const int uv_width = src_width >> 1;
@ -169,22 +173,25 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
//-----------------------------------------------------------------------------
static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) {
static void ConvertRGB24ToY_C(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width) {
int i;
for (i = 0; i < width; ++i, rgb += 3) {
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) {
static void ConvertBGR24ToY_C(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width) {
int i;
for (i = 0; i < width; ++i, bgr += 3) {
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
}
}
void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int width) {
int i;
for (i = 0; i < width; i += 1, rgb += 4) {
const int r = rgb[0], g = rgb[1], b = rgb[2];
@ -195,13 +202,18 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
//-----------------------------------------------------------------------------
void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width);
void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width);
void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width);
void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width);
void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT y, int width);
void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int src_width, int do_store);
extern void WebPInitConvertARGBToYUVSSE2(void);
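
The converters above are plain global function pointers: the C versions are installed first, and init routines such as WebPInitConvertARGBToYUVSSE2 overwrite them with SIMD variants once CPU features are known. The dispatch pattern, sketched with hypothetical names:

#include <stddef.h>
#include <stdint.h>

typedef void (*ConvertRowFunc)(const uint8_t* src, uint8_t* dst, int width);

static void ConvertRow_C(const uint8_t* src, uint8_t* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) dst[i] = src[i];  /* placeholder work */
}

ConvertRowFunc ConvertRow = NULL;

/* Called once at startup; a real init would probe CPU features and
 * install an SSE2/NEON variant when available. */
void InitConvertRow(void) {
  if (ConvertRow == NULL) ConvertRow = ConvertRow_C;
}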

View File

@ -11,15 +11,15 @@
//
// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
// More information at: https://en.wikipedia.org/wiki/YCbCr
// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
// Y = 0.2568 * R + 0.5041 * G + 0.0979 * B + 16
// U = -0.1482 * R - 0.2910 * G + 0.4392 * B + 128
// V = 0.4392 * R - 0.3678 * G - 0.0714 * B + 128
// We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
//
// For the Y'CbCr to RGB conversion, the BT.601 specification reads:
// R = 1.164 * (Y-16) + 1.596 * (V-128)
// G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.391 * (U-128)
// B = 1.164 * (Y-16) + 2.018 * (U-128)
// G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.392 * (U-128)
// B = 1.164 * (Y-16) + 2.017 * (U-128)
// where Y is in the [16,235] range, and U/V in the [16,240] range.
//
// The fixed-point implementation used here is:
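
The updated coefficient comments above were re-derived to match the 16-bit fixed-point constants actually in use. The conversion rounds a 16.16 product along these lines; the coefficients below are illustrative, computed as round(c * 65536) from the comment's floats, not copied from the source (the real table lives in the yuv headers):

#include <stdint.h>

#define YUV_FIX 16
#define YUV_HALF (1 << (YUV_FIX - 1))

/* Illustrative 16-bit fixed-point RGB->Y with BT.601 offset folded in. */
int RGBToY_Sketch(int r, int g, int b) {
  const int luma = 16830 * r + 33037 * g + 6416 * b + (16 << YUV_FIX);
  return (luma + YUV_HALF) >> YUV_FIX;  /* in [16, 235] for 8-bit input */
}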
@ -149,20 +149,34 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
#if defined(WEBP_USE_SSE2)
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst);
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
#endif // WEBP_USE_SSE2
@ -172,10 +186,14 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
#if defined(WEBP_USE_SSE41)
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
#endif // WEBP_USE_SSE41

View File

@ -22,9 +22,10 @@
// simple point-sampling
#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i, r, g, b; \
int temp0, temp1, temp2, temp3, temp4; \
for (i = 0; i < (len >> 1); i++) { \

View File

@ -69,9 +69,10 @@
: "memory", "hi", "lo" \
#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
const int t_con_1 = 26149; \

View File

@ -46,7 +46,8 @@ static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
return vqmovn_u16(Y2);
}
static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
static void ConvertRGB24ToY_NEON(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
const uint8x8x3_t RGB = vld3_u8(rgb);
@ -58,7 +59,8 @@ static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
}
}
static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
static void ConvertBGR24ToY_NEON(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
const uint8x8x3_t BGR = vld3_u8(bgr);
@ -70,7 +72,8 @@ static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
}
}
static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
static void ConvertARGBToY_NEON(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8) {
const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
@ -114,8 +117,9 @@ static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST); \
} while (0)
static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
static void ConvertRGBA32ToUV_NEON(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
@ -131,7 +135,9 @@ static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
}
}
static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
static void ConvertARGBToUV_NEON(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v,
int src_width, int do_store) {
int i;
for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {

View File

@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
}
// Convert 32 samples of YUV444 to R/G/B
static void YUV444ToRGB_SSE2(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
static void YUV444ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
const uint8_t* WEBP_RESTRICT const u,
const uint8_t* WEBP_RESTRICT const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE2(const uint8_t* const y,
}
// Convert 32 samples of YUV420 to R/G/B
static void YUV420ToRGB_SSE2(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
static void YUV420ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
const uint8_t* WEBP_RESTRICT const u,
const uint8_t* WEBP_RESTRICT const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
@ -108,7 +108,7 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
const __m128i* const A,
uint8_t* const dst) {
uint8_t* WEBP_RESTRICT const dst) {
const __m128i rb = _mm_packus_epi16(*R, *B);
const __m128i ga = _mm_packus_epi16(*G, *A);
const __m128i rg = _mm_unpacklo_epi8(rb, ga);
@ -120,11 +120,9 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
}
// Pack R/G/B/A results into 16b output.
static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
const __m128i* const A,
uint8_t* const dst) {
static WEBP_INLINE void PackAndStore4444_SSE2(
const __m128i* const R, const __m128i* const G, const __m128i* const B,
const __m128i* const A, uint8_t* WEBP_RESTRICT const dst) {
#if (WEBP_SWAP_16BIT_CSP == 0)
const __m128i rg0 = _mm_packus_epi16(*R, *G);
const __m128i ba0 = _mm_packus_epi16(*B, *A);
@ -145,7 +143,7 @@ static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
uint8_t* const dst) {
uint8_t* WEBP_RESTRICT const dst) {
const __m128i r0 = _mm_packus_epi16(*R, *R);
const __m128i g0 = _mm_packus_epi16(*G, *G);
const __m128i b0 = _mm_packus_epi16(*B, *B);
@ -170,7 +168,7 @@ static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
__m128i* const in2, __m128i* const in3,
__m128i* const in4, __m128i* const in5,
uint8_t* const rgb) {
uint8_t* WEBP_RESTRICT const rgb) {
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
@ -193,8 +191,10 @@ static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
_mm_storeu_si128((__m128i*)(rgb + 80), *in5);
}
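
PlanarTo24b_SSE2's comment describes reaching packed output through repeated half-interleaves; the end result is equivalent to this scalar scatter of three planes into RGB triplets, shown here as a reference for what the unpack passes compute:

#include <stdint.h>

/* Scalar equivalent of the planar-to-24b pack: interleave three
 * planes of 'count' samples into rgbrgb... output. The SSE2 version
 * reaches the same layout via successive _mm_unpacklo/hi_epi8 passes. */
void PlanarTo24b_Scalar(const uint8_t* r, const uint8_t* g,
                        const uint8_t* b, int count, uint8_t* rgb) {
  int i;
  for (i = 0; i < count; ++i) {
    rgb[3 * i + 0] = r[i];
    rgb[3 * i + 1] = g[i];
    rgb[3 * i + 2] = b[i];
  }
}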
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
@ -204,8 +204,10 @@ void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
@ -215,8 +217,10 @@ void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
@ -226,8 +230,10 @@ void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst) {
void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 16) {
@ -237,8 +243,10 @@ void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
}
}
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
int n;
for (n = 0; n < 32; n += 8, dst += 16) {
__m128i R, G, B;
@ -247,8 +255,10 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
@ -269,8 +279,10 @@ void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
}
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
@ -294,9 +306,10 @@ void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
static void YuvToRgbaRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToRgbaRow_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@ -316,9 +329,10 @@ static void YuvToRgbaRow_SSE2(const uint8_t* y,
}
}
static void YuvToBgraRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToBgraRow_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@ -338,9 +352,10 @@ static void YuvToBgraRow_SSE2(const uint8_t* y,
}
}
static void YuvToArgbRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToArgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@ -360,9 +375,10 @@ static void YuvToArgbRow_SSE2(const uint8_t* y,
}
}
static void YuvToRgbRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToRgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@ -397,9 +413,10 @@ static void YuvToRgbRow_SSE2(const uint8_t* y,
}
}
static void YuvToBgrRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToBgrRow_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@ -471,7 +488,7 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Similar to PlanarTo24bHelper(), but in reverse order.
static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
__m128i tmp[6];
tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
@ -488,8 +505,8 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
}
// Convert 8 packed ARGB to r[], g[], b[]
static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
__m128i* const rgb /*in[6]*/) {
static WEBP_INLINE void RGB32PackedToPlanar_SSE2(
const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128();
__m128i a0 = LOAD_16(argb + 0);
__m128i a1 = LOAD_16(argb + 4);
@ -562,7 +579,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
#undef MK_CST_16
#undef TRANSFORM
static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
static void ConvertRGB24ToY_SSE2(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
@ -596,7 +614,8 @@ static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
}
}
static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
static void ConvertBGR24ToY_SSE2(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
@ -630,7 +649,8 @@ static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
}
}
static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
static void ConvertARGBToY_SSE2(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT y, int width) {
const int max_width = width & ~15;
int i;
for (i = 0; i < max_width; i += 16) {
@ -658,8 +678,9 @@ static void HorizontalAddPack_SSE2(const __m128i* const A,
*out = _mm_packs_epi32(C, D);
}
static void ConvertARGBToUV_SSE2(const uint32_t* argb,
uint8_t* u, uint8_t* v,
static void ConvertARGBToUV_SSE2(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v,
int src_width, int do_store) {
const int max_width = src_width & ~31;
int i;
@ -695,7 +716,7 @@ static void ConvertARGBToUV_SSE2(const uint32_t* argb,
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
const uint16_t* const rgbx,
const uint16_t* WEBP_RESTRICT const rgbx,
__m128i* const r, __m128i* const g, __m128i* const b) {
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
@ -715,8 +736,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
*b = _mm_unpacklo_epi64(B1, B3);
}
static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
static void ConvertRGBA32ToUV_SSE2(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width) {
const int max_width = width & ~15;
const uint16_t* const last_rgb = rgb + 4 * max_width;
while (rgb < last_rgb) {

View File

@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
}
// Convert 32 samples of YUV444 to R/G/B
static void YUV444ToRGB_SSE41(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
static void YUV444ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
const uint8_t* WEBP_RESTRICT const u,
const uint8_t* WEBP_RESTRICT const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE41(const uint8_t* const y,
}
// Convert 32 samples of YUV420 to R/G/B
static void YUV420ToRGB_SSE41(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
static void YUV420ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
const uint8_t* WEBP_RESTRICT const u,
const uint8_t* WEBP_RESTRICT const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
@ -109,7 +109,7 @@ static void YUV420ToRGB_SSE41(const uint8_t* const y,
static WEBP_INLINE void PlanarTo24b_SSE41(
__m128i* const in0, __m128i* const in1, __m128i* const in2,
__m128i* const in3, __m128i* const in4, __m128i* const in5,
uint8_t* const rgb) {
uint8_t* WEBP_RESTRICT const rgb) {
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
@ -132,8 +132,10 @@ static WEBP_INLINE void PlanarTo24b_SSE41(
_mm_storeu_si128((__m128i*)(rgb + 80), *in5);
}
void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
@ -154,8 +156,10 @@ void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
}
void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
@ -179,9 +183,10 @@ void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
static void YuvToRgbRow_SSE41(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToRgbRow_SSE41(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@ -216,9 +221,10 @@ static void YuvToRgbRow_SSE41(const uint8_t* y,
}
}
static void YuvToBgrRow_SSE41(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToBgrRow_SSE41(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@ -290,7 +296,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Similar to PlanarTo24bHelper(), but in reverse order.
static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
@ -334,7 +340,7 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
// Convert 8 packed ARGB to r[], g[], b[]
static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
const uint32_t* const argb, __m128i* const rgb /*in[6]*/) {
const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128();
__m128i a0 = LOAD_16(argb + 0);
__m128i a1 = LOAD_16(argb + 4);
@ -407,7 +413,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
#undef MK_CST_16
#undef TRANSFORM
static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
static void ConvertRGB24ToY_SSE41(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
@ -441,7 +448,8 @@ static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
}
}
static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
static void ConvertBGR24ToY_SSE41(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
@ -475,7 +483,8 @@ static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
}
}
static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
static void ConvertARGBToY_SSE41(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT y, int width) {
const int max_width = width & ~15;
int i;
for (i = 0; i < max_width; i += 16) {
@ -503,8 +512,9 @@ static void HorizontalAddPack_SSE41(const __m128i* const A,
*out = _mm_packs_epi32(C, D);
}
static void ConvertARGBToUV_SSE41(const uint32_t* argb,
uint8_t* u, uint8_t* v,
static void ConvertARGBToUV_SSE41(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v,
int src_width, int do_store) {
const int max_width = src_width & ~31;
int i;
@ -540,7 +550,7 @@ static void ConvertARGBToUV_SSE41(const uint32_t* argb,
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
const uint16_t* const rgbx,
const uint16_t* WEBP_RESTRICT const rgbx,
__m128i* const r, __m128i* const g, __m128i* const b) {
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
@ -570,8 +580,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
*b = _mm_unpackhi_epi64(B1, B3);
}
static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
static void ConvertRGBA32ToUV_SSE41(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width) {
const int max_width = width & ~15;
const uint16_t* const last_rgb = rgb + 4 * max_width;
while (rgb < last_rgb) {

View File

@ -276,6 +276,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
stats->lossless_features = best.stats.lossless_features;
stats->histogram_bits = best.stats.histogram_bits;
stats->transform_bits = best.stats.transform_bits;
stats->cross_color_transform_bits = best.stats.cross_color_transform_bits;
stats->cache_bits = best.stats.cache_bits;
stats->palette_size = best.stats.palette_size;
stats->lossless_size = best.stats.lossless_size;

View File

@ -15,7 +15,7 @@
//
#include <assert.h>
#include <float.h>
#include <string.h>
#include "src/dsp/lossless_common.h"
#include "src/enc/backward_references_enc.h"
@ -31,15 +31,15 @@ extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
const PixOrCopy v);
typedef struct {
float alpha_[VALUES_IN_BYTE];
float red_[VALUES_IN_BYTE];
float blue_[VALUES_IN_BYTE];
float distance_[NUM_DISTANCE_CODES];
float* literal_;
uint32_t alpha_[VALUES_IN_BYTE];
uint32_t red_[VALUES_IN_BYTE];
uint32_t blue_[VALUES_IN_BYTE];
uint32_t distance_[NUM_DISTANCE_CODES];
uint32_t* literal_;
} CostModel;
static void ConvertPopulationCountTableToBitEstimates(
int num_symbols, const uint32_t population_counts[], float output[]) {
int num_symbols, const uint32_t population_counts[], uint32_t output[]) {
uint32_t sum = 0;
int nonzeros = 0;
int i;
@ -52,7 +52,7 @@ static void ConvertPopulationCountTableToBitEstimates(
if (nonzeros <= 1) {
memset(output, 0, num_symbols * sizeof(*output));
} else {
const float logsum = VP8LFastLog2(sum);
const uint32_t logsum = VP8LFastLog2(sum);
for (i = 0; i < num_symbols; ++i) {
output[i] = logsum - VP8LFastLog2(population_counts[i]);
}
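
The estimate is the classic log2(sum / count) bits per symbol, now carried in
integers. Assuming VP8LFastLog2 returns values scaled by
1 << LOG_2_PRECISION_BITS like the other costs in this change, and picking
power-of-two counts so the fast log is exact:

// population_counts[i] == 8, sum == 1024:
// output[i] = VP8LFastLog2(1024) - VP8LFastLog2(8)
//           = (10 << LOG_2_PRECISION_BITS) - (3 << LOG_2_PRECISION_BITS)
//           = 7 << LOG_2_PRECISION_BITS    // ~7 bits per occurrence
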
@ -93,47 +93,47 @@ static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
return ok;
}
static WEBP_INLINE float GetLiteralCost(const CostModel* const m, uint32_t v) {
return m->alpha_[v >> 24] +
m->red_[(v >> 16) & 0xff] +
m->literal_[(v >> 8) & 0xff] +
m->blue_[v & 0xff];
static WEBP_INLINE int64_t GetLiteralCost(const CostModel* const m,
uint32_t v) {
return (int64_t)m->alpha_[v >> 24] + m->red_[(v >> 16) & 0xff] +
m->literal_[(v >> 8) & 0xff] + m->blue_[v & 0xff];
}
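
A literal's cost is the sum of one per-channel cost per ARGB byte, e.g.:

// v == 0x80ff0102 (A=0x80, R=0xff, G=0x01, B=0x02):
// cost = alpha_[0x80] + red_[0xff] + literal_[0x01] + blue_[0x02]
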
static WEBP_INLINE float GetCacheCost(const CostModel* const m, uint32_t idx) {
static WEBP_INLINE int64_t GetCacheCost(const CostModel* const m,
uint32_t idx) {
const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
return m->literal_[literal_idx];
return (int64_t)m->literal_[literal_idx];
}
static WEBP_INLINE float GetLengthCost(const CostModel* const m,
uint32_t length) {
static WEBP_INLINE int64_t GetLengthCost(const CostModel* const m,
uint32_t length) {
int code, extra_bits;
VP8LPrefixEncodeBits(length, &code, &extra_bits);
return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
return (int64_t)m->literal_[VALUES_IN_BYTE + code] +
((int64_t)extra_bits << LOG_2_PRECISION_BITS);
}
static WEBP_INLINE float GetDistanceCost(const CostModel* const m,
uint32_t distance) {
static WEBP_INLINE int64_t GetDistanceCost(const CostModel* const m,
uint32_t distance) {
int code, extra_bits;
VP8LPrefixEncodeBits(distance, &code, &extra_bits);
return m->distance_[code] + extra_bits;
return (int64_t)m->distance_[code] +
((int64_t)extra_bits << LOG_2_PRECISION_BITS);
}
static WEBP_INLINE void AddSingleLiteralWithCostModel(
const uint32_t* const argb, VP8LColorCache* const hashers,
const CostModel* const cost_model, int idx, int use_color_cache,
float prev_cost, float* const cost, uint16_t* const dist_array) {
float cost_val = prev_cost;
int64_t prev_cost, int64_t* const cost, uint16_t* const dist_array) {
int64_t cost_val = prev_cost;
const uint32_t color = argb[idx];
const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
if (ix >= 0) {
// use_color_cache is true and hashers contains color
const float mul0 = 0.68f;
cost_val += GetCacheCost(cost_model, ix) * mul0;
cost_val += DivRound(GetCacheCost(cost_model, ix) * 68, 100);
} else {
const float mul1 = 0.82f;
if (use_color_cache) VP8LColorCacheInsert(hashers, color);
cost_val += GetLiteralCost(cost_model, color) * mul1;
cost_val += DivRound(GetLiteralCost(cost_model, color) * 82, 100);
}
if (cost[idx] > cost_val) {
cost[idx] = cost_val;
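
The float multipliers 0.68f and 0.82f become the integer ratios 68/100 and
82/100 applied with a rounding division. A plausible sketch of such a helper
(named MyDivRound here; upstream's DivRound lives elsewhere in this change and
may differ in detail):

static WEBP_INLINE int64_t MyDivRound(int64_t num, int64_t den) {
  // Round to nearest, assuming den > 0.
  return (num >= 0) ? (num + den / 2) / den : -((den / 2 - num) / den);
}
// e.g. MyDivRound(1000 * 68, 100) == 680, the fixed-point analogue of
// 1000 * 0.68f.
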
@ -163,7 +163,7 @@ static WEBP_INLINE void AddSingleLiteralWithCostModel(
// therefore no overlapping intervals.
typedef struct CostInterval CostInterval;
struct CostInterval {
float cost_;
int64_t cost_;
int start_;
int end_;
int index_;
@ -173,7 +173,7 @@ struct CostInterval {
// The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
typedef struct {
float cost_;
int64_t cost_;
int start_;
int end_; // Exclusive.
} CostCacheInterval;
@ -188,8 +188,9 @@ typedef struct {
int count_; // The number of stored intervals.
CostCacheInterval* cache_intervals_;
size_t cache_intervals_size_;
float cost_cache_[MAX_LENGTH]; // Contains the GetLengthCost(cost_model, k).
float* costs_;
// Contains the GetLengthCost(cost_model, k).
int64_t cost_cache_[MAX_LENGTH];
int64_t* costs_;
uint16_t* dist_array_;
// Most of the time, we only need few intervals -> use a free-list, to avoid
// fragmentation with small allocs in most common cases.
@ -298,7 +299,7 @@ static int CostManagerInit(CostManager* const manager,
cur->end_ = 1;
cur->cost_ = manager->cost_cache_[0];
for (i = 1; i < cost_cache_size; ++i) {
const float cost_val = manager->cost_cache_[i];
const int64_t cost_val = manager->cost_cache_[i];
if (cost_val != cur->cost_) {
++cur;
// Initialize an interval.
@ -311,13 +312,15 @@ static int CostManagerInit(CostManager* const manager,
manager->cache_intervals_size_);
}
manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
manager->costs_ =
(int64_t*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
if (manager->costs_ == NULL) {
CostManagerClear(manager);
return 0;
}
// Set the initial costs_ high for every pixel as we will keep the minimum.
for (i = 0; i < pix_count; ++i) manager->costs_[i] = FLT_MAX;
// Set the initial costs_ to INT64_MAX for every pixel as we will keep the
// minimum.
for (i = 0; i < pix_count; ++i) manager->costs_[i] = WEBP_INT64_MAX;
return 1;
}
@ -325,7 +328,7 @@ static int CostManagerInit(CostManager* const manager,
// Given the cost and the position that define an interval, update the cost at
// pixel 'i' if it is smaller than the previously computed value.
static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
int position, float cost) {
int position, int64_t cost) {
const int k = i - position;
assert(k >= 0 && k < MAX_LENGTH);
@ -339,7 +342,7 @@ static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
// all the pixels between 'start' and 'end' excluded.
static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
int start, int end, int position,
float cost) {
int64_t cost) {
int i;
for (i = start; i < end; ++i) UpdateCost(manager, i, position, cost);
}
@ -424,7 +427,7 @@ static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
// interval_in as a hint. The intervals are sorted by start_ value.
static WEBP_INLINE void InsertInterval(CostManager* const manager,
CostInterval* const interval_in,
float cost, int position, int start,
int64_t cost, int position, int start,
int end) {
CostInterval* interval_new;
@ -463,7 +466,7 @@ static WEBP_INLINE void InsertInterval(CostManager* const manager,
// If handling the interval or one of its subintervals becomes too heavy, its
// contribution is added to the costs right away.
static WEBP_INLINE void PushInterval(CostManager* const manager,
float distance_cost, int position,
int64_t distance_cost, int position,
int len) {
size_t i;
CostInterval* interval = manager->head_;
@ -478,7 +481,7 @@ static WEBP_INLINE void PushInterval(CostManager* const manager,
int j;
for (j = position; j < position + len; ++j) {
const int k = j - position;
float cost_tmp;
int64_t cost_tmp;
assert(k >= 0 && k < MAX_LENGTH);
cost_tmp = distance_cost + manager->cost_cache_[k];
@ -498,7 +501,7 @@ static WEBP_INLINE void PushInterval(CostManager* const manager,
const int end = position + (cost_cache_intervals[i].end_ > len
? len
: cost_cache_intervals[i].end_);
const float cost = distance_cost + cost_cache_intervals[i].cost_;
const int64_t cost = distance_cost + cost_cache_intervals[i].cost_;
for (; interval != NULL && interval->start_ < end;
interval = interval_next) {
@ -576,7 +579,7 @@ static int BackwardReferencesHashChainDistanceOnly(
const int pix_count = xsize * ysize;
const int use_color_cache = (cache_bits > 0);
const size_t literal_array_size =
sizeof(float) * (VP8LHistogramNumCodes(cache_bits));
sizeof(*((CostModel*)NULL)->literal_) * VP8LHistogramNumCodes(cache_bits);
const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
CostModel* const cost_model =
(CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
@ -584,13 +587,13 @@ static int BackwardReferencesHashChainDistanceOnly(
CostManager* cost_manager =
(CostManager*)WebPSafeCalloc(1ULL, sizeof(*cost_manager));
int offset_prev = -1, len_prev = -1;
float offset_cost = -1.f;
int64_t offset_cost = -1;
int first_offset_is_constant = -1; // initialized with 'impossible' value
int reach = 0;
if (cost_model == NULL || cost_manager == NULL) goto Error;
cost_model->literal_ = (float*)(cost_model + 1);
cost_model->literal_ = (uint32_t*)(cost_model + 1);
if (use_color_cache) {
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
if (!cc_init) goto Error;
@ -608,11 +611,12 @@ static int BackwardReferencesHashChainDistanceOnly(
// non-processed locations from this point.
dist_array[0] = 0;
// Add first pixel as literal.
AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
0.f, cost_manager->costs_, dist_array);
AddSingleLiteralWithCostModel(argb, &hashers, cost_model, /*idx=*/0,
use_color_cache, /*prev_cost=*/0,
cost_manager->costs_, dist_array);
for (i = 1; i < pix_count; ++i) {
const float prev_cost = cost_manager->costs_[i - 1];
const int64_t prev_cost = cost_manager->costs_[i - 1];
int offset, len;
VP8LHashChainFindCopy(hash_chain, i, &offset, &len);

View File

@ -13,8 +13,6 @@
#include "src/enc/backward_references_enc.h"
#include <assert.h>
#include <float.h>
#include <math.h>
#include "src/dsp/dsp.h"
#include "src/dsp/lossless.h"
@ -27,8 +25,6 @@
#define MIN_BLOCK_SIZE 256 // minimum block size for backward references
#define MAX_ENTROPY (1e30f)
// 1M window (4M bytes) minus 120 special codes for short distances.
#define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
@ -758,7 +754,7 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
int* const best_cache_bits) {
int i;
const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
float entropy_min = MAX_ENTROPY;
uint64_t entropy_min = WEBP_UINT64_MAX;
int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
VP8LRefsCursor c = VP8LRefsCursorInit(refs);
@ -843,7 +839,7 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
}
for (i = 0; i <= cache_bits_max; ++i) {
const float entropy = VP8LHistogramEstimateBits(histos[i]);
const uint64_t entropy = VP8LHistogramEstimateBits(histos[i]);
if (i == 0 || entropy < entropy_min) {
entropy_min = entropy;
*best_cache_bits = i;
@ -920,7 +916,7 @@ static int GetBackwardReferences(int width, int height,
int i, lz77_type;
// Index 0 is for a color cache, index 1 for no cache (if needed).
int lz77_types_best[2] = {0, 0};
float bit_costs_best[2] = {FLT_MAX, FLT_MAX};
uint64_t bit_costs_best[2] = {WEBP_UINT64_MAX, WEBP_UINT64_MAX};
VP8LHashChain hash_chain_box;
VP8LBackwardRefs* const refs_tmp = &refs[do_no_cache ? 2 : 1];
int status = 0;
@ -932,7 +928,7 @@ static int GetBackwardReferences(int width, int height,
for (lz77_type = 1; lz77_types_to_try;
lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
int res = 0;
float bit_cost = 0.f;
uint64_t bit_cost = 0u;
if ((lz77_types_to_try & lz77_type) == 0) continue;
switch (lz77_type) {
case kLZ77RLE:
@ -1006,7 +1002,7 @@ static int GetBackwardReferences(int width, int height,
const VP8LHashChain* const hash_chain_tmp =
(lz77_types_best[i] == kLZ77Standard) ? hash_chain : &hash_chain_box;
const int cache_bits = (i == 1) ? 0 : *cache_bits_best;
float bit_cost_trace;
uint64_t bit_cost_trace;
if (!VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits,
hash_chain_tmp, &refs[i],
refs_tmp)) {

View File

@ -55,7 +55,6 @@ int WebPConfigInitInternal(WebPConfig* config,
config->thread_level = 0;
config->low_memory = 0;
config->near_lossless = 100;
config->use_delta_palette = 0;
config->use_sharp_yuv = 0;
// TODO(skal): tune.
@ -125,9 +124,6 @@ int WebPValidateConfig(const WebPConfig* config) {
if (config->thread_level < 0 || config->thread_level > 1) return 0;
if (config->low_memory < 0 || config->low_memory > 1) return 0;
if (config->exact < 0 || config->exact > 1) return 0;
if (config->use_delta_palette < 0 || config->use_delta_palette > 1) {
return 0;
}
if (config->use_sharp_yuv < 0 || config->use_sharp_yuv > 1) return 0;
return 1;

View File

@ -19,7 +19,7 @@
// For each given level, the following table gives the pattern of contexts to
// use for coding it (in [][0]) as well as the bit value to use for each
// context (in [][1]).
const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
static const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
{0x001, 0x000}, {0x007, 0x001}, {0x00f, 0x005},
{0x00f, 0x00d}, {0x033, 0x003}, {0x033, 0x003}, {0x033, 0x023},
{0x033, 0x023}, {0x033, 0x023}, {0x033, 0x023}, {0x0d3, 0x013},

View File

@ -61,7 +61,6 @@ static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
}
// Level cost calculations
extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2];
void VP8CalculateLevelCosts(VP8EncProba* const proba);
static WEBP_INLINE int VP8LevelCost(const uint16_t* const table, int level) {
return VP8LevelFixedCosts[level]

View File

@ -13,8 +13,7 @@
#include "src/webp/config.h"
#endif
#include <float.h>
#include <math.h>
#include <string.h>
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
@ -23,8 +22,6 @@
#include "src/enc/vp8i_enc.h"
#include "src/utils/utils.h"
#define MAX_BIT_COST FLT_MAX
// Number of partitions for the three dominant (literal, red and blue) symbol
// costs.
#define NUM_PARTITIONS 4
@ -33,10 +30,18 @@
// Maximum number of histograms allowed in greedy combining algorithm.
#define MAX_HISTO_GREEDY 100
// Return the size of the histogram for a given cache_bits.
static int GetHistogramSize(int cache_bits) {
const int literal_size = VP8LHistogramNumCodes(cache_bits);
const size_t total_size = sizeof(VP8LHistogram) + sizeof(int) * literal_size;
assert(total_size <= (size_t)0x7fffffff);
return (int)total_size;
}
static void HistogramClear(VP8LHistogram* const p) {
uint32_t* const literal = p->literal_;
const int cache_bits = p->palette_code_bits_;
const int histo_size = VP8LGetHistogramSize(cache_bits);
const int histo_size = GetHistogramSize(cache_bits);
memset(p, 0, histo_size);
p->palette_code_bits_ = cache_bits;
p->literal_ = literal;
@ -54,20 +59,13 @@ static void HistogramCopy(const VP8LHistogram* const src,
uint32_t* const dst_literal = dst->literal_;
const int dst_cache_bits = dst->palette_code_bits_;
const int literal_size = VP8LHistogramNumCodes(dst_cache_bits);
const int histo_size = VP8LGetHistogramSize(dst_cache_bits);
const int histo_size = GetHistogramSize(dst_cache_bits);
assert(src->palette_code_bits_ == dst_cache_bits);
memcpy(dst, src, histo_size);
dst->literal_ = dst_literal;
memcpy(dst->literal_, src->literal_, literal_size * sizeof(*dst->literal_));
}
int VP8LGetHistogramSize(int cache_bits) {
const int literal_size = VP8LHistogramNumCodes(cache_bits);
const size_t total_size = sizeof(VP8LHistogram) + sizeof(int) * literal_size;
assert(total_size <= (size_t)0x7fffffff);
return (int)total_size;
}
void VP8LFreeHistogram(VP8LHistogram* const histo) {
WebPSafeFree(histo);
}
@ -102,17 +100,17 @@ void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits,
HistogramClear(p);
} else {
p->trivial_symbol_ = 0;
p->bit_cost_ = 0.;
p->literal_cost_ = 0.;
p->red_cost_ = 0.;
p->blue_cost_ = 0.;
p->bit_cost_ = 0;
p->literal_cost_ = 0;
p->red_cost_ = 0;
p->blue_cost_ = 0;
memset(p->is_used_, 0, sizeof(p->is_used_));
}
}
VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
VP8LHistogram* histo = NULL;
const int total_size = VP8LGetHistogramSize(cache_bits);
const int total_size = GetHistogramSize(cache_bits);
uint8_t* const memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
if (memory == NULL) return NULL;
histo = (VP8LHistogram*)memory;
@ -126,7 +124,7 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
static void HistogramSetResetPointers(VP8LHistogramSet* const set,
int cache_bits) {
int i;
const int histo_size = VP8LGetHistogramSize(cache_bits);
const int histo_size = GetHistogramSize(cache_bits);
uint8_t* memory = (uint8_t*) (set->histograms);
memory += set->max_size * sizeof(*set->histograms);
for (i = 0; i < set->max_size; ++i) {
@ -140,7 +138,7 @@ static void HistogramSetResetPointers(VP8LHistogramSet* const set,
// Returns the total size of the VP8LHistogramSet.
static size_t HistogramSetTotalSize(int size, int cache_bits) {
const int histo_size = VP8LGetHistogramSize(cache_bits);
const int histo_size = GetHistogramSize(cache_bits);
return (sizeof(VP8LHistogramSet) + size * (sizeof(VP8LHistogram*) +
histo_size + WEBP_ALIGN_CST));
}
@ -230,8 +228,8 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
// -----------------------------------------------------------------------------
// Entropy-related functions.
static WEBP_INLINE float BitsEntropyRefine(const VP8LBitEntropy* entropy) {
float mix;
static WEBP_INLINE uint64_t BitsEntropyRefine(const VP8LBitEntropy* entropy) {
uint64_t mix;
if (entropy->nonzeros < 5) {
if (entropy->nonzeros <= 1) {
return 0;
@ -240,67 +238,72 @@ static WEBP_INLINE float BitsEntropyRefine(const VP8LBitEntropy* entropy) {
// Let's mix in a bit of entropy to favor good clustering when
// distributions of these are combined.
if (entropy->nonzeros == 2) {
return 0.99f * entropy->sum + 0.01f * entropy->entropy;
return DivRound(99 * ((uint64_t)entropy->sum << LOG_2_PRECISION_BITS) +
entropy->entropy,
100);
}
// No matter what the entropy says, we cannot be better than min_limit
// with Huffman coding. I am mixing a bit of entropy into the
// min_limit since it produces much better (~0.5 %) compression results
// perhaps because of better entropy clustering.
if (entropy->nonzeros == 3) {
mix = 0.95f;
mix = 950;
} else {
mix = 0.7f; // nonzeros == 4.
mix = 700; // nonzeros == 4.
}
} else {
mix = 0.627f;
mix = 627;
}
{
float min_limit = 2.f * entropy->sum - entropy->max_val;
min_limit = mix * min_limit + (1.f - mix) * entropy->entropy;
uint64_t min_limit = (uint64_t)(2 * entropy->sum - entropy->max_val)
<< LOG_2_PRECISION_BITS;
min_limit =
DivRound(mix * min_limit + (1000 - mix) * entropy->entropy, 1000);
return (entropy->entropy < min_limit) ? min_limit : entropy->entropy;
}
}
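
The per-mille weights reproduce the old float mixes exactly: entropy values
are already scaled by 1 << LOG_2_PRECISION_BITS, so only the plain sums need
shifting. For the nonzeros == 2 case:

// old: 0.99f * sum + 0.01f * entropy
// new: DivRound(99 * ((uint64_t)sum << LOG_2_PRECISION_BITS) + entropy, 100)
// Both compute the same 99:1 weighted mix; only the representation changed.
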
float VP8LBitsEntropy(const uint32_t* const array, int n) {
uint64_t VP8LBitsEntropy(const uint32_t* const array, int n) {
VP8LBitEntropy entropy;
VP8LBitsEntropyUnrefined(array, n, &entropy);
return BitsEntropyRefine(&entropy);
}
static float InitialHuffmanCost(void) {
static uint64_t InitialHuffmanCost(void) {
// Small bias because Huffman code length is typically not stored in
// full length.
static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
static const float kSmallBias = 9.1f;
return kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
static const uint64_t kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
// Subtract a bias of 9.1.
return (kHuffmanCodeOfHuffmanCodeSize << LOG_2_PRECISION_BITS) -
DivRound(91ll << LOG_2_PRECISION_BITS, 10);
}
// Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
static float FinalHuffmanCost(const VP8LStreaks* const stats) {
// The constants in this function are experimental and got rounded from
static uint64_t FinalHuffmanCost(const VP8LStreaks* const stats) {
// The constants in this function are empirical and got rounded from
// their original values in 1/8 when switched to 1/1024.
float retval = InitialHuffmanCost();
uint64_t retval = InitialHuffmanCost();
// Second coefficient: Many zeros in the histogram are covered efficiently
// by a run-length encode. Originally 2/8.
retval += stats->counts[0] * 1.5625f + 0.234375f * stats->streaks[0][1];
uint32_t retval_extra = stats->counts[0] * 1600 + 240 * stats->streaks[0][1];
// Second coefficient: Constant values are encoded less efficiently, but still
// RLE'ed. Originally 6/8.
retval += stats->counts[1] * 2.578125f + 0.703125f * stats->streaks[1][1];
retval_extra += stats->counts[1] * 2640 + 720 * stats->streaks[1][1];
// 0s are usually encoded more efficiently than non-0s.
// Originally 15/8.
retval += 1.796875f * stats->streaks[0][0];
retval_extra += 1840 * stats->streaks[0][0];
// Originally 26/8.
retval += 3.28125f * stats->streaks[1][0];
return retval;
retval_extra += 3360 * stats->streaks[1][0];
return retval + ((uint64_t)retval_extra << (LOG_2_PRECISION_BITS - 10));
}
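
The new integer coefficients are the old float ones expressed in 1/1024 units,
as the comment above notes:

// 1600 / 1024 == 1.5625      240 / 1024 == 0.234375
// 2640 / 1024 == 2.578125    720 / 1024 == 0.703125
// 1840 / 1024 == 1.796875    3360 / 1024 == 3.28125
// The trailing shift by (LOG_2_PRECISION_BITS - 10) rescales retval_extra
// from 1/1024 units to the 1 << LOG_2_PRECISION_BITS fixed point.
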
// Get the symbol entropy for the distribution 'population'.
// Set 'trivial_sym', if there's only one symbol present in the distribution.
static float PopulationCost(const uint32_t* const population, int length,
uint32_t* const trivial_sym,
uint8_t* const is_used) {
static uint64_t PopulationCost(const uint32_t* const population, int length,
uint32_t* const trivial_sym,
uint8_t* const is_used) {
VP8LBitEntropy bit_entropy;
VP8LStreaks stats;
VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats);
@ -316,10 +319,11 @@ static float PopulationCost(const uint32_t* const population, int length,
// trivial_at_end is 1 if the two histograms only have one element that is
// non-zero: both the zero-th one, or both the last one.
static WEBP_INLINE float GetCombinedEntropy(const uint32_t* const X,
const uint32_t* const Y, int length,
int is_X_used, int is_Y_used,
int trivial_at_end) {
static WEBP_INLINE uint64_t GetCombinedEntropy(const uint32_t* const X,
const uint32_t* const Y,
int length, int is_X_used,
int is_Y_used,
int trivial_at_end) {
VP8LStreaks stats;
if (trivial_at_end) {
// This configuration is due to palettization that transforms an indexed
@ -357,7 +361,7 @@ static WEBP_INLINE float GetCombinedEntropy(const uint32_t* const X,
}
// Estimates the Entropy + Huffman + other block overhead size cost.
float VP8LHistogramEstimateBits(VP8LHistogram* const p) {
uint64_t VP8LHistogramEstimateBits(VP8LHistogram* const p) {
return PopulationCost(p->literal_,
VP8LHistogramNumCodes(p->palette_code_bits_), NULL,
&p->is_used_[0]) +
@ -366,27 +370,42 @@ float VP8LHistogramEstimateBits(VP8LHistogram* const p) {
PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3]) +
PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL,
&p->is_used_[4]) +
(float)VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES,
NUM_LENGTH_CODES) +
(float)VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
((uint64_t)(VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES,
NUM_LENGTH_CODES) +
VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES))
<< LOG_2_PRECISION_BITS);
}
// -----------------------------------------------------------------------------
// Various histogram combine/cost-eval functions
static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
const VP8LHistogram* const b,
float cost_threshold, float* cost) {
// Store a + b in b, saturating at WEBP_INT64_MAX.
static WEBP_INLINE void SaturateAdd(uint64_t a, int64_t* b) {
if (*b < 0 || (int64_t)a <= WEBP_INT64_MAX - *b) {
*b += (int64_t)a;
} else {
*b = WEBP_INT64_MAX;
}
}
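
SaturateAdd clamps instead of letting the signed addition overflow, e.g.:

// int64_t b = WEBP_INT64_MAX - 5;
// SaturateAdd(10u, &b);  // b == WEBP_INT64_MAX (clamped, no wrap)
// int64_t c = -100;
// SaturateAdd(50u, &c);  // c == -50 (a negative *b always adds directly)
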
// Returns 1 if the cost of the combined histogram is less than the threshold.
// Otherwise returns 0 and the cost is invalid due to early bail-out.
WEBP_NODISCARD static int GetCombinedHistogramEntropy(
const VP8LHistogram* const a, const VP8LHistogram* const b,
int64_t cost_threshold_in, uint64_t* cost) {
const int palette_code_bits = a->palette_code_bits_;
int trivial_at_end = 0;
const uint64_t cost_threshold = (uint64_t)cost_threshold_in;
assert(a->palette_code_bits_ == b->palette_code_bits_);
*cost += GetCombinedEntropy(a->literal_, b->literal_,
VP8LHistogramNumCodes(palette_code_bits),
a->is_used_[0], b->is_used_[0], 0);
*cost += (float)VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
b->literal_ + NUM_LITERAL_CODES,
NUM_LENGTH_CODES);
if (*cost > cost_threshold) return 0;
if (cost_threshold_in <= 0) return 0;
*cost = GetCombinedEntropy(a->literal_, b->literal_,
VP8LHistogramNumCodes(palette_code_bits),
a->is_used_[0], b->is_used_[0], 0);
*cost += (uint64_t)VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
b->literal_ + NUM_LITERAL_CODES,
NUM_LENGTH_CODES)
<< LOG_2_PRECISION_BITS;
if (*cost >= cost_threshold) return 0;
if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
a->trivial_symbol_ == b->trivial_symbol_) {
@ -401,27 +420,24 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
}
}
*cost +=
GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, a->is_used_[1],
b->is_used_[1], trivial_at_end);
if (*cost > cost_threshold) return 0;
*cost += GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES,
a->is_used_[1], b->is_used_[1], trivial_at_end);
if (*cost >= cost_threshold) return 0;
*cost +=
GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, a->is_used_[2],
b->is_used_[2], trivial_at_end);
if (*cost > cost_threshold) return 0;
*cost += GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES,
a->is_used_[2], b->is_used_[2], trivial_at_end);
if (*cost >= cost_threshold) return 0;
*cost +=
GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES,
a->is_used_[3], b->is_used_[3], trivial_at_end);
if (*cost > cost_threshold) return 0;
*cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES,
a->is_used_[3], b->is_used_[3], trivial_at_end);
if (*cost >= cost_threshold) return 0;
*cost +=
GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
a->is_used_[4], b->is_used_[4], 0);
*cost += (float)VP8LExtraCostCombined(a->distance_, b->distance_,
NUM_DISTANCE_CODES);
if (*cost > cost_threshold) return 0;
*cost += GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
a->is_used_[4], b->is_used_[4], 0);
*cost += (uint64_t)VP8LExtraCostCombined(a->distance_, b->distance_,
NUM_DISTANCE_CODES)
<< LOG_2_PRECISION_BITS;
if (*cost >= cost_threshold) return 0;
return 1;
}
@ -441,33 +457,39 @@ static WEBP_INLINE void HistogramAdd(const VP8LHistogram* const a,
// Since the previous score passed is 'cost_threshold', we only need to compare
// the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail-out
// early.
static float HistogramAddEval(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out, float cost_threshold) {
float cost = 0;
const float sum_cost = a->bit_cost_ + b->bit_cost_;
cost_threshold += sum_cost;
// Returns 1 if the cost is less than the threshold.
// Otherwise returns 0 and the cost is invalid due to early bail-out.
WEBP_NODISCARD static int HistogramAddEval(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out,
int64_t cost_threshold) {
uint64_t cost;
const uint64_t sum_cost = a->bit_cost_ + b->bit_cost_;
SaturateAdd(sum_cost, &cost_threshold);
if (!GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) return 0;
if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) {
HistogramAdd(a, b, out);
out->bit_cost_ = cost;
out->palette_code_bits_ = a->palette_code_bits_;
}
return cost - sum_cost;
HistogramAdd(a, b, out);
out->bit_cost_ = cost;
out->palette_code_bits_ = a->palette_code_bits_;
return 1;
}
// Same as HistogramAddEval(), except that the resulting histogram
// is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
// the term C(b) which is constant over all the evaluations.
static float HistogramAddThresh(const VP8LHistogram* const a,
const VP8LHistogram* const b,
float cost_threshold) {
float cost;
// Returns 1 if the cost is less than the threshold.
// Otherwise returns 0 and the cost is invalid due to early bail-out.
WEBP_NODISCARD static int HistogramAddThresh(const VP8LHistogram* const a,
const VP8LHistogram* const b,
int64_t cost_threshold,
int64_t* cost_out) {
uint64_t cost;
assert(a != NULL && b != NULL);
cost = -a->bit_cost_;
GetCombinedHistogramEntropy(a, b, cost_threshold, &cost);
return cost;
SaturateAdd(a->bit_cost_, &cost_threshold);
if (!GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) return 0;
*cost_out = (int64_t)cost - (int64_t)a->bit_cost_;
return 1;
}
// -----------------------------------------------------------------------------
@ -475,21 +497,21 @@ static float HistogramAddThresh(const VP8LHistogram* const a,
// The structure to keep track of cost range for the three dominant entropy
// symbols.
typedef struct {
float literal_max_;
float literal_min_;
float red_max_;
float red_min_;
float blue_max_;
float blue_min_;
uint64_t literal_max_;
uint64_t literal_min_;
uint64_t red_max_;
uint64_t red_min_;
uint64_t blue_max_;
uint64_t blue_min_;
} DominantCostRange;
static void DominantCostRangeInit(DominantCostRange* const c) {
c->literal_max_ = 0.;
c->literal_min_ = MAX_BIT_COST;
c->red_max_ = 0.;
c->red_min_ = MAX_BIT_COST;
c->blue_max_ = 0.;
c->blue_min_ = MAX_BIT_COST;
c->literal_max_ = 0;
c->literal_min_ = WEBP_UINT64_MAX;
c->red_max_ = 0;
c->red_min_ = WEBP_UINT64_MAX;
c->blue_max_ = 0;
c->blue_min_ = WEBP_UINT64_MAX;
}
static void UpdateDominantCostRange(
@ -504,15 +526,18 @@ static void UpdateDominantCostRange(
static void UpdateHistogramCost(VP8LHistogram* const h) {
uint32_t alpha_sym, red_sym, blue_sym;
const float alpha_cost =
const uint64_t alpha_cost =
PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, &h->is_used_[3]);
const float distance_cost =
const uint64_t distance_cost =
PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
(float)VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
((uint64_t)VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES)
<< LOG_2_PRECISION_BITS);
const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
h->literal_cost_ =
PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) +
(float)VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES);
((uint64_t)VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES,
NUM_LENGTH_CODES)
<< LOG_2_PRECISION_BITS);
h->red_cost_ =
PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]);
h->blue_cost_ =
@ -527,10 +552,10 @@ static void UpdateHistogramCost(VP8LHistogram* const h) {
}
}
static int GetBinIdForEntropy(float min, float max, float val) {
const float range = max - min;
if (range > 0.) {
const float delta = val - min;
static int GetBinIdForEntropy(uint64_t min, uint64_t max, uint64_t val) {
const uint64_t range = max - min;
if (range > 0) {
const uint64_t delta = val - min;
return (int)((NUM_PARTITIONS - 1e-6) * delta / range);
} else {
return 0;
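
The 1e-6 epsilon keeps the bin id strictly below NUM_PARTITIONS:

// NUM_PARTITIONS == 4, val == max: delta == range, so the id is
// (int)((4 - 1e-6) * 1.0) == 3; val == min gives delta == 0, hence id 0.
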
@ -576,11 +601,11 @@ static void HistogramBuild(
}
// Copies the histograms and computes its bit_cost.
static const uint16_t kInvalidHistogramSymbol = (uint16_t)(-1);
static const uint32_t kInvalidHistogramSymbol = (uint32_t)(-1);
static void HistogramCopyAndAnalyze(VP8LHistogramSet* const orig_histo,
VP8LHistogramSet* const image_histo,
int* const num_used,
uint16_t* const histogram_symbols) {
uint32_t* const histogram_symbols) {
int i, cluster_id;
int num_used_orig = *num_used;
VP8LHistogram** const orig_histograms = orig_histo->histograms;
@ -639,11 +664,12 @@ static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
// Merges some histograms with same bin_id together if it's advantageous.
// Sets the remaining histograms to NULL.
// 'combine_cost_factor' has to be divided by 100.
static void HistogramCombineEntropyBin(
VP8LHistogramSet* const image_histo, int* num_used,
const uint16_t* const clusters, uint16_t* const cluster_mappings,
const uint32_t* const clusters, uint16_t* const cluster_mappings,
VP8LHistogram* cur_combo, const uint16_t* const bin_map, int num_bins,
float combine_cost_factor, int low_effort) {
int32_t combine_cost_factor, int low_effort) {
VP8LHistogram** const histograms = image_histo->histograms;
int idx;
struct {
@ -673,11 +699,11 @@ static void HistogramCombineEntropyBin(
cluster_mappings[clusters[idx]] = clusters[first];
} else {
// try to merge #idx into #first (both share the same bin_id)
const float bit_cost = histograms[idx]->bit_cost_;
const float bit_cost_thresh = -bit_cost * combine_cost_factor;
const float curr_cost_diff = HistogramAddEval(
histograms[first], histograms[idx], cur_combo, bit_cost_thresh);
if (curr_cost_diff < bit_cost_thresh) {
const uint64_t bit_cost = histograms[idx]->bit_cost_;
const int64_t bit_cost_thresh =
-DivRound((int64_t)bit_cost * combine_cost_factor, 100);
if (HistogramAddEval(histograms[first], histograms[idx], cur_combo,
bit_cost_thresh)) {
// Try to merge two histograms only if the combo is a trivial one or
// the two candidate histograms are already non-trivial.
// For some images, 'try_combine' turns out to be false for a lot of
@ -724,8 +750,8 @@ static uint32_t MyRand(uint32_t* const seed) {
typedef struct {
int idx1;
int idx2;
float cost_diff;
float cost_combo;
int64_t cost_diff;
uint64_t cost_combo;
} HistogramPair;
typedef struct {
@ -765,7 +791,7 @@ static void HistoQueuePopPair(HistoQueue* const histo_queue,
// Check whether a pair in the queue should be updated as head or not.
static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
HistogramPair* const pair) {
assert(pair->cost_diff < 0.);
assert(pair->cost_diff < 0);
assert(pair >= histo_queue->queue &&
pair < (histo_queue->queue + histo_queue->size));
assert(histo_queue->size > 0);
@ -778,29 +804,35 @@ static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
}
// Update the cost diff and combo of a pair of histograms. This needs to be
// called when the the histograms have been merged with a third one.
static void HistoQueueUpdatePair(const VP8LHistogram* const h1,
const VP8LHistogram* const h2, float threshold,
HistogramPair* const pair) {
const float sum_cost = h1->bit_cost_ + h2->bit_cost_;
pair->cost_combo = 0.;
GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair->cost_combo);
pair->cost_diff = pair->cost_combo - sum_cost;
// called when the histograms have been merged with a third one.
// Returns 1 if the cost diff is less than the threshold.
// Otherwise returns 0 and the cost is invalid due to early bail-out.
WEBP_NODISCARD static int HistoQueueUpdatePair(const VP8LHistogram* const h1,
const VP8LHistogram* const h2,
int64_t cost_threshold,
HistogramPair* const pair) {
const int64_t sum_cost = h1->bit_cost_ + h2->bit_cost_;
SaturateAdd(sum_cost, &cost_threshold);
if (!GetCombinedHistogramEntropy(h1, h2, cost_threshold, &pair->cost_combo)) {
return 0;
}
pair->cost_diff = (int64_t)pair->cost_combo - sum_cost;
return 1;
}
// Create a pair from indices "idx1" and "idx2" provided its cost
// is below "threshold", a negative entropy.
// It returns the cost of the pair, or 0. if it superior to threshold.
static float HistoQueuePush(HistoQueue* const histo_queue,
VP8LHistogram** const histograms, int idx1,
int idx2, float threshold) {
// It returns the cost of the pair, or 0 if it is above the threshold.
static int64_t HistoQueuePush(HistoQueue* const histo_queue,
VP8LHistogram** const histograms, int idx1,
int idx2, int64_t threshold) {
const VP8LHistogram* h1;
const VP8LHistogram* h2;
HistogramPair pair;
// Stop here if the queue is full.
if (histo_queue->size == histo_queue->max_size) return 0.;
assert(threshold <= 0.);
if (histo_queue->size == histo_queue->max_size) return 0;
assert(threshold <= 0);
if (idx1 > idx2) {
const int tmp = idx2;
idx2 = idx1;
@ -811,10 +843,8 @@ static float HistoQueuePush(HistoQueue* const histo_queue,
h1 = histograms[idx1];
h2 = histograms[idx2];
HistoQueueUpdatePair(h1, h2, threshold, &pair);
// Do not even consider the pair if it does not improve the entropy.
if (pair.cost_diff >= threshold) return 0.;
if (!HistoQueueUpdatePair(h1, h2, threshold, &pair)) return 0;
histo_queue->queue[histo_queue->size++] = pair;
HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]);
@ -851,7 +881,7 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo,
for (j = i + 1; j < image_histo_size; ++j) {
// Initialize queue.
if (image_histo->histograms[j] == NULL) continue;
HistoQueuePush(&histo_queue, histograms, i, j, 0.);
HistoQueuePush(&histo_queue, histograms, i, j, 0);
}
}
@ -879,7 +909,7 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo,
// Push new pairs formed with combined histogram to the queue.
for (i = 0; i < image_histo->size; ++i) {
if (i == idx1 || image_histo->histograms[i] == NULL) continue;
HistoQueuePush(&histo_queue, image_histo->histograms, idx1, i, 0.);
HistoQueuePush(&histo_queue, image_histo->histograms, idx1, i, 0);
}
}
@ -937,8 +967,8 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
++tries_with_no_success < num_tries_no_success;
++iter) {
int* mapping_index;
float best_cost =
(histo_queue.size == 0) ? 0.f : histo_queue.queue[0].cost_diff;
int64_t best_cost =
(histo_queue.size == 0) ? 0 : histo_queue.queue[0].cost_diff;
int best_idx1 = -1, best_idx2 = 1;
const uint32_t rand_range = (*num_used - 1) * (*num_used);
// (*num_used) / 2 was chosen empirically. Less means faster but worse
@ -947,7 +977,7 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
// Pick random samples.
for (j = 0; *num_used >= 2 && j < num_tries; ++j) {
float curr_cost;
int64_t curr_cost;
// Choose two different histograms at random and try to combine them.
const uint32_t tmp = MyRand(&seed) % rand_range;
uint32_t idx1 = tmp / (*num_used - 1);
@ -1012,8 +1042,8 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
}
if (do_eval) {
// Re-evaluate the cost of an updated pair.
HistoQueueUpdatePair(histograms[p->idx1], histograms[p->idx2], 0., p);
if (p->cost_diff >= 0.) {
if (!HistoQueueUpdatePair(histograms[p->idx1], histograms[p->idx2], 0,
p)) {
HistoQueuePopPair(&histo_queue, p);
continue;
}
@ -1040,7 +1070,7 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
// Note: we assume that out[]->bit_cost_ is already up-to-date.
static void HistogramRemap(const VP8LHistogramSet* const in,
VP8LHistogramSet* const out,
uint16_t* const symbols) {
uint32_t* const symbols) {
int i;
VP8LHistogram** const in_histo = in->histograms;
VP8LHistogram** const out_histo = out->histograms;
@ -1049,7 +1079,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
if (out_size > 1) {
for (i = 0; i < in_size; ++i) {
int best_out = 0;
float best_bits = MAX_BIT_COST;
int64_t best_bits = WEBP_INT64_MAX;
int k;
if (in_histo[i] == NULL) {
// Arbitrarily set to the previous value if unused to help future LZ77.
@ -1057,9 +1087,9 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
continue;
}
for (k = 0; k < out_size; ++k) {
float cur_bits;
cur_bits = HistogramAddThresh(out_histo[k], in_histo[i], best_bits);
if (k == 0 || cur_bits < best_bits) {
int64_t cur_bits;
if (HistogramAddThresh(out_histo[k], in_histo[i], best_bits,
&cur_bits)) {
best_bits = cur_bits;
best_out = k;
}
@ -1085,13 +1115,13 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
}
}
static float GetCombineCostFactor(int histo_size, int quality) {
float combine_cost_factor = 0.16f;
static int32_t GetCombineCostFactor(int histo_size, int quality) {
int32_t combine_cost_factor = 16;
if (quality < 90) {
if (histo_size > 256) combine_cost_factor /= 2.f;
if (histo_size > 512) combine_cost_factor /= 2.f;
if (histo_size > 1024) combine_cost_factor /= 2.f;
if (quality <= 50) combine_cost_factor /= 2.f;
if (histo_size > 256) combine_cost_factor /= 2;
if (histo_size > 512) combine_cost_factor /= 2;
if (histo_size > 1024) combine_cost_factor /= 2;
if (quality <= 50) combine_cost_factor /= 2;
}
return combine_cost_factor;
}
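
The integer factor is the old float one scaled by 100, e.g.:

// histo_size == 600, quality == 40:
// 16 / 2 (>256) / 2 (>512) / 2 (quality <= 50) == 2,
// matching the old 0.16f / 2 / 2 / 2 == 0.02f.
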
@ -1101,10 +1131,10 @@ static float GetCombineCostFactor(int histo_size, int quality) {
// assign the smallest possible clusters values.
static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set,
uint16_t* const cluster_mappings,
int num_clusters,
uint32_t num_clusters,
uint16_t* const cluster_mappings_tmp,
uint16_t* const symbols) {
int i, cluster_max;
uint32_t* const symbols) {
uint32_t i, cluster_max;
int do_continue = 1;
// First, assign the lowest cluster to each pixel.
while (do_continue) {
@ -1128,7 +1158,7 @@ static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set,
set->max_size * sizeof(*cluster_mappings_tmp));
assert(cluster_mappings[0] == 0);
// Re-map the ids.
for (i = 0; i < set->max_size; ++i) {
for (i = 0; i < (uint32_t)set->max_size; ++i) {
int cluster;
if (symbols[i] == kInvalidHistogramSymbol) continue;
cluster = cluster_mappings[symbols[i]];
@ -1142,7 +1172,7 @@ static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set,
// Make sure all cluster values are used.
cluster_max = 0;
for (i = 0; i < set->max_size; ++i) {
for (i = 0; i < (uint32_t)set->max_size; ++i) {
if (symbols[i] == kInvalidHistogramSymbol) continue;
if (symbols[i] <= cluster_max) continue;
++cluster_max;
@ -1165,7 +1195,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
int low_effort, int histogram_bits, int cache_bits,
VP8LHistogramSet* const image_histo,
VP8LHistogram* const tmp_histo,
uint16_t* const histogram_symbols,
uint32_t* const histogram_symbols,
const WebPPicture* const pic, int percent_range,
int* const percent) {
const int histo_xsize =
@ -1181,7 +1211,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
int entropy_combine;
uint16_t* const map_tmp =
WebPSafeMalloc(2 * image_histo_raw_size, sizeof(*map_tmp));
(uint16_t*)WebPSafeMalloc(2 * image_histo_raw_size, sizeof(*map_tmp));
uint16_t* const cluster_mappings = map_tmp + image_histo_raw_size;
int num_used = image_histo_raw_size;
if (orig_histo == NULL || map_tmp == NULL) {
@ -1201,7 +1231,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
if (entropy_combine) {
uint16_t* const bin_map = map_tmp;
const float combine_cost_factor =
const int32_t combine_cost_factor =
GetCombineCostFactor(image_histo_raw_size, quality);
const uint32_t num_clusters = num_used;
@ -1217,9 +1247,10 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
// Don't combine the histograms using stochastic and greedy heuristics for
// low-effort compression mode.
if (!low_effort || !entropy_combine) {
const float x = quality / 100.f;
// cubic ramp between 1 and MAX_HISTO_GREEDY:
const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1));
const int threshold_size =
(int)(1 + DivRound(quality * quality * quality * (MAX_HISTO_GREEDY - 1),
100 * 100 * 100));
int do_greedy;
if (!HistogramCombineStochastic(image_histo, &num_used, threshold_size,
&do_greedy)) {
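
The integer cubic ramp tracks the old float one to within one unit (the
division now rounds to nearest where the float-to-int cast truncated). With
MAX_HISTO_GREEDY == 100:

// quality == 75: DivRound(75 * 75 * 75 * 99, 100 * 100 * 100)
//             == DivRound(41765625, 1000000) == 42 -> threshold_size == 43
// (the old float ramp gave (int)(1 + 0.75 * 0.75 * 0.75 * 99) == 42).
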

View File

@ -40,10 +40,10 @@ typedef struct {
int palette_code_bits_;
uint32_t trivial_symbol_; // True, if histograms for Red, Blue & Alpha
// literal symbols are single valued.
float bit_cost_; // cached value of bit cost.
float literal_cost_; // Cached values of dominant entropy costs:
float red_cost_; // literal, red & blue.
float blue_cost_;
uint64_t bit_cost_; // cached value of bit cost.
uint64_t literal_cost_; // Cached values of dominant entropy costs:
uint64_t red_cost_; // literal, red & blue.
uint64_t blue_cost_;
uint8_t is_used_[5]; // 5 for literal, red, blue, alpha, distance
} VP8LHistogram;
@ -64,9 +64,6 @@ void VP8LHistogramCreate(VP8LHistogram* const p,
const VP8LBackwardRefs* const refs,
int palette_code_bits);
// Return the size of the histogram for a given cache_bits.
int VP8LGetHistogramSize(int cache_bits);
// Set the palette_code_bits and reset the stats.
// If init_arrays is true, the arrays are also filled with 0's.
void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits,
@ -112,16 +109,16 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
int low_effort, int histogram_bits, int cache_bits,
VP8LHistogramSet* const image_histo,
VP8LHistogram* const tmp_histo,
uint16_t* const histogram_symbols,
uint32_t* const histogram_symbols,
const WebPPicture* const pic, int percent_range,
int* const percent);
// Returns the entropy for the symbols in the input array.
float VP8LBitsEntropy(const uint32_t* const array, int n);
uint64_t VP8LBitsEntropy(const uint32_t* const array, int n);
// Estimate how many bits the combined entropy of literals and distance
// approximately maps to.
float VP8LHistogramEstimateBits(VP8LHistogram* const p);
uint64_t VP8LHistogramEstimateBits(VP8LHistogram* const p);
#ifdef __cplusplus
}

View File

@ -13,6 +13,7 @@
#include <string.h>
#include "src/dsp/cpu.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
@ -54,7 +55,8 @@ void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
InitLeft(it);
}
void VP8IteratorReset(VP8EncIterator* const it) {
// restart a scan
static void VP8IteratorReset(VP8EncIterator* const it) {
VP8Encoder* const enc = it->enc_;
VP8IteratorSetRow(it, 0);
VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_); // default
@ -424,6 +426,15 @@ void VP8IteratorStartI4(VP8EncIterator* const it) {
it->i4_boundary_[17 + i] = it->i4_boundary_[17 + 15];
}
}
#if WEBP_AARCH64 && BPS == 32 && defined(WEBP_MSAN)
// Intra4Preds_NEON() reads 3 uninitialized bytes from i4_boundary_ when top
// is positioned at offset 29 (VP8TopLeftI4[3]). The values are not used
// meaningfully, but due to limitations in MemorySanitizer related to
// modeling of tbl instructions, a warning will be issued. This can be
// removed if MSan is updated to support the instructions. See
// https://issues.webmproject.org/372109644.
memset(it->i4_boundary_ + sizeof(it->i4_boundary_) - 3, 0xaa, 3);
#endif
VP8IteratorNzToBytes(it); // import the non-zero context
}

View File

@ -14,53 +14,75 @@
// Urvang Joshi (urvang@google.com)
// Vincent Rabaud (vrabaud@google.com)
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include "src/enc/vp8i_enc.h"
#include "src/enc/vp8li_enc.h"
#include "src/utils/utils.h"
#include "src/webp/encode.h"
#include "src/webp/format_constants.h"
#include "src/webp/types.h"
#define MAX_DIFF_COST (1e30f)
static const float kSpatialPredictorBias = 15.f;
#define HISTO_SIZE (4 * 256)
static const int64_t kSpatialPredictorBias = 15ll << LOG_2_PRECISION_BITS;
static const int kPredLowEffort = 11;
static const uint32_t kMaskAlpha = 0xff000000;
static const int kNumPredModes = 14;
// Used mostly to reduce code size and improve readability.
static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
//------------------------------------------------------------------------------
// Methods to calculate Entropy (Shannon).
static float PredictionCostSpatial(const int counts[256], int weight_0,
float exp_val) {
// Compute a bias for prediction entropy using a global heuristic to favor
// values closer to 0. Hence the final negative sign.
// 'exp_val' has a scaling factor of 1/100.
static int64_t PredictionCostBias(const uint32_t counts[256], uint64_t weight_0,
uint64_t exp_val) {
const int significant_symbols = 256 >> 4;
const float exp_decay_factor = 0.6f;
float bits = (float)weight_0 * counts[0];
const uint64_t exp_decay_factor = 6; // has a scaling factor of 1/10
uint64_t bits = (weight_0 * counts[0]) << LOG_2_PRECISION_BITS;
int i;
exp_val <<= LOG_2_PRECISION_BITS;
for (i = 1; i < significant_symbols; ++i) {
bits += exp_val * (counts[i] + counts[256 - i]);
exp_val *= exp_decay_factor;
bits += DivRound(exp_val * (counts[i] + counts[256 - i]), 100);
exp_val = DivRound(exp_decay_factor * exp_val, 10);
}
return (float)(-0.1 * bits);
return -DivRound((int64_t)bits, 10);
}
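
The bias still decays geometrically over the near-zero symbols (counts[i] and
counts[256 - i]); the start value 94 (scale 1/100) and decay factor 6
(scale 1/10) reproduce the old floats:

// effective weights: 0.94, 0.94 * 0.6 == 0.564, 0.564 * 0.6 == 0.3384, ...
// i.e. the old kExpValue = 0.94f with exp_decay_factor = 0.6f.
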
static float PredictionCostSpatialHistogram(const int accumulated[4][256],
const int tile[4][256]) {
static int64_t PredictionCostSpatialHistogram(
const uint32_t accumulated[HISTO_SIZE], const uint32_t tile[HISTO_SIZE],
int mode, int left_mode, int above_mode) {
int i;
float retval = 0.f;
int64_t retval = 0;
for (i = 0; i < 4; ++i) {
const float kExpValue = 0.94f;
retval += PredictionCostSpatial(tile[i], 1, kExpValue);
retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
const uint64_t kExpValue = 94;
retval += PredictionCostBias(&tile[i * 256], 1, kExpValue);
// Compute the new cost if 'tile' is added to 'accumulated' but also add the
// cost of the current histogram to guide the spatial predictor selection.
// Basically, favor low entropy, locally and globally.
retval += (int64_t)VP8LCombinedShannonEntropy(&tile[i * 256],
&accumulated[i * 256]);
}
return (float)retval;
// Favor keeping the areas locally similar.
if (mode == left_mode) retval -= kSpatialPredictorBias;
if (mode == above_mode) retval -= kSpatialPredictorBias;
return retval;
}
static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
++histo_argb[0][argb >> 24];
++histo_argb[1][(argb >> 16) & 0xff];
++histo_argb[2][(argb >> 8) & 0xff];
++histo_argb[3][argb & 0xff];
static WEBP_INLINE void UpdateHisto(uint32_t histo_argb[HISTO_SIZE],
uint32_t argb) {
++histo_argb[0 * 256 + (argb >> 24)];
++histo_argb[1 * 256 + ((argb >> 16) & 0xff)];
++histo_argb[2 * 256 + ((argb >> 8) & 0xff)];
++histo_argb[3 * 256 + (argb & 0xff)];
}
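
The old int histo[4][256] is now a single flat array of HISTO_SIZE == 4 * 256
counters in channel-major order, so channel c (A,R,G,B = 0..3) and byte value
v index as:

// histo_argb[c * 256 + v]
// e.g. the red byte of 'argb' bumps histo_argb[1 * 256 + ((argb >> 16) & 0xff)]
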
//------------------------------------------------------------------------------
@ -91,8 +113,6 @@ static WEBP_INLINE void PredictBatch(int mode, int x_start, int y,
}
#if (WEBP_NEAR_LOSSLESS == 1)
static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
@ -291,23 +311,80 @@ static WEBP_INLINE void GetResidual(
}
}
// Returns best predictor and updates the accumulated histogram.
// Accessors to residual histograms.
static WEBP_INLINE uint32_t* GetHistoArgb(uint32_t* const all_histos,
int subsampling_index, int mode) {
return &all_histos[(subsampling_index * kNumPredModes + mode) * HISTO_SIZE];
}
static WEBP_INLINE const uint32_t* GetHistoArgbConst(
const uint32_t* const all_histos, int subsampling_index, int mode) {
return &all_histos[subsampling_index * kNumPredModes * HISTO_SIZE +
mode * HISTO_SIZE];
}
// Accessors to accumulated residual histogram.
static WEBP_INLINE uint32_t* GetAccumulatedHisto(uint32_t* all_accumulated,
int subsampling_index) {
return &all_accumulated[subsampling_index * HISTO_SIZE];
}
// Find and store the best predictor for a tile at subsampling
// 'subsampling_index'.
static void GetBestPredictorForTile(const uint32_t* const all_argb,
int subsampling_index, int tile_x,
int tile_y, int tiles_per_row,
uint32_t* all_accumulated_argb,
uint32_t** const all_modes,
uint32_t* const all_pred_histos) {
uint32_t* const accumulated_argb =
GetAccumulatedHisto(all_accumulated_argb, subsampling_index);
uint32_t* const modes = all_modes[subsampling_index];
uint32_t* const pred_histos =
&all_pred_histos[subsampling_index * kNumPredModes];
// Prediction modes of the left and above neighbor tiles.
const int left_mode =
(tile_x > 0) ? (modes[tile_y * tiles_per_row + tile_x - 1] >> 8) & 0xff
: 0xff;
const int above_mode =
(tile_y > 0) ? (modes[(tile_y - 1) * tiles_per_row + tile_x] >> 8) & 0xff
: 0xff;
int mode;
int64_t best_diff = WEBP_INT64_MAX;
uint32_t best_mode = 0;
const uint32_t* best_histo =
GetHistoArgbConst(all_argb, /*subsampling_index=*/0, best_mode);
for (mode = 0; mode < kNumPredModes; ++mode) {
const uint32_t* const histo_argb =
GetHistoArgbConst(all_argb, subsampling_index, mode);
const int64_t cur_diff = PredictionCostSpatialHistogram(
accumulated_argb, histo_argb, mode, left_mode, above_mode);
if (cur_diff < best_diff) {
best_histo = histo_argb;
best_diff = cur_diff;
best_mode = mode;
}
}
// Update the accumulated histogram.
VP8LAddVectorEq(best_histo, accumulated_argb, HISTO_SIZE);
modes[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (best_mode << 8);
++pred_histos[best_mode];
}
// Computes the residuals for the different predictors.
// If max_quantization > 1, assumes that near lossless processing will be
// applied, quantizing residuals to multiples of quantization levels up to
// max_quantization (the actual quantization level depends on smoothness near
// the given pixel).
static int GetBestPredictorForTile(int width, int height,
int tile_x, int tile_y, int bits,
int accumulated[4][256],
uint32_t* const argb_scratch,
const uint32_t* const argb,
int max_quantization,
int exact, int used_subtract_green,
const uint32_t* const modes) {
const int kNumPredModes = 14;
const int start_x = tile_x << bits;
const int start_y = tile_y << bits;
const int tile_size = 1 << bits;
static void ComputeResidualsForTile(
int width, int height, int tile_x, int tile_y, int min_bits,
uint32_t update_up_to_index, uint32_t* const all_argb,
uint32_t* const argb_scratch, const uint32_t* const argb,
int max_quantization, int exact, int used_subtract_green) {
const int start_x = tile_x << min_bits;
const int start_y = tile_y << min_bits;
const int tile_size = 1 << min_bits;
const int max_y = GetMin(tile_size, height - start_y);
const int max_x = GetMin(tile_size, width - start_x);
// Whether there exist columns just outside the tile.
@ -318,35 +395,20 @@ static int GetBestPredictorForTile(int width, int height,
#if (WEBP_NEAR_LOSSLESS == 1)
const int context_width = max_x + have_left + (max_x < width - start_x);
#endif
const int tiles_per_row = VP8LSubSampleSize(width, bits);
// Prediction modes of the left and above neighbor tiles.
const int left_mode = (tile_x > 0) ?
(modes[tile_y * tiles_per_row + tile_x - 1] >> 8) & 0xff : 0xff;
const int above_mode = (tile_y > 0) ?
(modes[(tile_y - 1) * tiles_per_row + tile_x] >> 8) & 0xff : 0xff;
// The width of upper_row and current_row is one pixel larger than image width
// to allow the top right pixel to point to the leftmost pixel of the next row
// when at the right edge.
uint32_t* upper_row = argb_scratch;
uint32_t* current_row = upper_row + width + 1;
uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1);
float best_diff = MAX_DIFF_COST;
int best_mode = 0;
int mode;
int histo_stack_1[4][256];
int histo_stack_2[4][256];
// Need pointers to be able to swap arrays.
int (*histo_argb)[256] = histo_stack_1;
int (*best_histo)[256] = histo_stack_2;
int i, j;
uint32_t residuals[1 << MAX_TRANSFORM_BITS];
assert(bits <= MAX_TRANSFORM_BITS);
assert(max_x <= (1 << MAX_TRANSFORM_BITS));
for (mode = 0; mode < kNumPredModes; ++mode) {
float cur_diff;
int relative_y;
memset(histo_argb, 0, sizeof(histo_stack_1));
uint32_t* const histo_argb =
GetHistoArgb(all_argb, /*subsampling_index=*/0, mode);
if (start_y > 0) {
// Read the row above the tile which will become the first upper_row.
// Include a pixel to the left if it exists; include a pixel to the right
@ -382,41 +444,31 @@ static int GetBestPredictorForTile(int width, int height,
for (relative_x = 0; relative_x < max_x; ++relative_x) {
UpdateHisto(histo_argb, residuals[relative_x]);
}
}
cur_diff = PredictionCostSpatialHistogram(
(const int (*)[256])accumulated, (const int (*)[256])histo_argb);
// Favor keeping the areas locally similar.
if (mode == left_mode) cur_diff -= kSpatialPredictorBias;
if (mode == above_mode) cur_diff -= kSpatialPredictorBias;
if (cur_diff < best_diff) {
int (*tmp)[256] = histo_argb;
histo_argb = best_histo;
best_histo = tmp;
best_diff = cur_diff;
best_mode = mode;
if (update_up_to_index > 0) {
uint32_t subsampling_index;
for (subsampling_index = 1; subsampling_index <= update_up_to_index;
++subsampling_index) {
uint32_t* const super_histo =
GetHistoArgb(all_argb, subsampling_index, mode);
for (relative_x = 0; relative_x < max_x; ++relative_x) {
UpdateHisto(super_histo, residuals[relative_x]);
}
}
}
}
}
for (i = 0; i < 4; i++) {
for (j = 0; j < 256; j++) {
accumulated[i][j] += best_histo[i][j];
}
}
return best_mode;
}
// Converts pixels of the image to residuals with respect to predictions.
// If max_quantization > 1, applies near lossless processing, quantizing
// residuals to multiples of quantization levels up to max_quantization
// (the actual quantization level depends on smoothness near the given pixel).
static void CopyImageWithPrediction(int width, int height,
int bits, uint32_t* const modes,
static void CopyImageWithPrediction(int width, int height, int bits,
const uint32_t* const modes,
uint32_t* const argb_scratch,
uint32_t* const argb,
int low_effort, int max_quantization,
int exact, int used_subtract_green) {
uint32_t* const argb, int low_effort,
int max_quantization, int exact,
int used_subtract_green) {
const int tiles_per_row = VP8LSubSampleSize(width, bits);
// The width of upper_row and current_row is one pixel larger than image width
// to allow the top right pixel to point to the leftmost pixel of the next row
@ -469,47 +521,307 @@ static void CopyImageWithPrediction(int width, int height,
}
}
// Checks whether 'image' can be subsampled by finding the largest
// power-of-2 squares of uniform value (of side 2^'best_bits') that tile it.
void VP8LOptimizeSampling(uint32_t* const image, int full_width,
int full_height, int bits, int max_bits,
int* best_bits_out) {
int width = VP8LSubSampleSize(full_width, bits);
int height = VP8LSubSampleSize(full_height, bits);
int old_width, x, y, square_size;
int best_bits = bits;
*best_bits_out = bits;
// Check rows first.
while (best_bits < max_bits) {
const int new_square_size = 1 << (best_bits + 1 - bits);
int is_good = 1;
square_size = 1 << (best_bits - bits);
for (y = 0; y + square_size < height; y += new_square_size) {
// Check the first lines of consecutive line groups.
if (memcmp(&image[y * width], &image[(y + square_size) * width],
width * sizeof(*image)) != 0) {
is_good = 0;
break;
}
}
if (is_good) {
++best_bits;
} else {
break;
}
}
if (best_bits == bits) return;
// Check columns.
while (best_bits > bits) {
int is_good = 1;
square_size = 1 << (best_bits - bits);
for (y = 0; is_good && y < height; ++y) {
for (x = 0; is_good && x < width; x += square_size) {
int i;
for (i = x + 1; i < GetMin(x + square_size, width); ++i) {
if (image[y * width + i] != image[y * width + x]) {
is_good = 0;
break;
}
}
}
}
if (is_good) {
break;
}
--best_bits;
}
if (best_bits == bits) return;
// Subsample the image.
old_width = width;
square_size = 1 << (best_bits - bits);
width = VP8LSubSampleSize(full_width, best_bits);
height = VP8LSubSampleSize(full_height, best_bits);
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
image[y * width + x] = image[square_size * (y * old_width + x)];
}
}
*best_bits_out = best_bits;
}
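// A minimal standalone sketch (not part of libwebp) of one sampling step of
// the function above: check that every 2x2 square is uniform and keep one
// pixel per square.
#include <assert.h>
#include <stdint.h>
static int SketchSubsample2x2(const uint32_t* image, int w, int h,
                              uint32_t* out) {
  int x, y;
  for (y = 0; y < h; y += 2) {
    for (x = 0; x < w; x += 2) {
      const uint32_t v = image[y * w + x];
      if (image[y * w + x + 1] != v || image[(y + 1) * w + x] != v ||
          image[(y + 1) * w + x + 1] != v) {
        return 0;  // not uniform: keep the finer sampling
      }
      out[(y / 2) * (w / 2) + x / 2] = v;
    }
  }
  return 1;
}
int main(void) {
  const uint32_t img[16] = {7, 7, 9, 9, 7, 7, 9, 9, 3, 3, 5, 5, 3, 3, 5, 5};
  uint32_t small[4];
  assert(SketchSubsample2x2(img, 4, 4, small));
  assert(small[0] == 7 && small[1] == 9 && small[2] == 3 && small[3] == 5);
  return 0;
}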
// Computes the best predictor image.
// Finds the best predictors per tile. Once done, finds the best predictor image
// sampling.
// best_bits is set to 0 in case of error.
// The following uses this glossary:
// - a tile is a square of side 2^min_bits pixels.
// - a super-tile of a tile is a square of side 2^bits pixels with bits in
// [min_bits+1, max_bits].
// - the max-tile of a tile is the square of 2^max_bits pixels containing it.
// If this max-tile crosses the border of an image, it is cropped.
// - tiles, super-tiles and max-tiles are aligned on powers of 2 in the
// original image.
// - coordinates for tile, super-tile, max-tile are respectively named
// tile_x, super_tile_x, max_tile_x at their bit scale.
// - in the max-tile, a tile has local coordinates (local_tile_x, local_tile_y).
// The tiles are processed in the following zigzag order to complete the
// super-tiles as soon as possible:
// 1 2| 5 6
// 3 4| 7 8
// --------------
// 9 10| 13 14
// 11 12| 15 16
// When computing the residuals for a tile, the histogram of the above
// super-tile is updated. If this super-tile is finished, its histogram is used
// to update the histogram of the next super-tile and so on up to the max-tile.
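// (A standalone sketch of this zigzag/Z-order traversal follows the function
// below.)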
static void GetBestPredictorsAndSubSampling(
int width, int height, const int min_bits, const int max_bits,
uint32_t* const argb_scratch, const uint32_t* const argb,
int max_quantization, int exact, int used_subtract_green,
const WebPPicture* const pic, int percent_range, int* const percent,
uint32_t** const all_modes, int* best_bits, uint32_t** best_mode) {
const uint32_t tiles_per_row = VP8LSubSampleSize(width, min_bits);
const uint32_t tiles_per_col = VP8LSubSampleSize(height, min_bits);
int64_t best_cost;
uint32_t subsampling_index;
const uint32_t max_subsampling_index = max_bits - min_bits;
// Compute the needed memory size for residual histograms, accumulated
// residual histograms and predictor histograms.
const int num_argb = (max_subsampling_index + 1) * kNumPredModes * HISTO_SIZE;
const int num_accumulated_rgb = (max_subsampling_index + 1) * HISTO_SIZE;
const int num_predictors = (max_subsampling_index + 1) * kNumPredModes;
uint32_t* const raw_data = (uint32_t*)WebPSafeCalloc(
num_argb + num_accumulated_rgb + num_predictors, sizeof(uint32_t));
uint32_t* const all_argb = raw_data;
uint32_t* const all_accumulated_argb = all_argb + num_argb;
uint32_t* const all_pred_histos = all_accumulated_argb + num_accumulated_rgb;
const int max_tile_size = 1 << max_subsampling_index; // in tile size
int percent_start = *percent;
// When using the residuals of a tile for its super-tiles, you can either:
// - use each residual to update the histogram of the super-tile, with a cost
// of 4 * (1<<n)^2 increment operations (4 for the number of channels, and
// (1<<n)^2 for the number of pixels in the tile)
// - use the histogram of the tile to update the histogram of the super-tile,
// with a cost of HISTO_SIZE (1024)
// The first method is therefore faster until n==4. 'update_up_to_index'
// defines the maximum subsampling_index for which the residuals should be
// individually added to the super-tile histogram.
const uint32_t update_up_to_index =
GetMax(GetMin(4, max_bits), min_bits) - min_bits;
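// Worked example of the trade-off above (sketch, not upstream code):
// per-pixel updates cost 4 * (1 << n) * (1 << n) increments versus
// HISTO_SIZE (1024) additions for one histogram merge:
//   n == 3: 4 * 8 * 8   =  256 < 1024  -> update per pixel
//   n == 4: 4 * 16 * 16 = 1024 == 1024 -> break-even
//   n == 5: 4 * 32 * 32 = 4096 > 1024  -> merge histograms instead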
// Coordinates in the max-tile in tile units.
uint32_t local_tile_x = 0, local_tile_y = 0;
uint32_t max_tile_x = 0, max_tile_y = 0;
uint32_t tile_x = 0, tile_y = 0;
*best_bits = 0;
*best_mode = NULL;
if (raw_data == NULL) return;
while (tile_y < tiles_per_col) {
ComputeResidualsForTile(width, height, tile_x, tile_y, min_bits,
update_up_to_index, all_argb, argb_scratch, argb,
max_quantization, exact, used_subtract_green);
// Update all the super-tiles that are complete.
subsampling_index = 0;
while (1) {
const uint32_t super_tile_x = tile_x >> subsampling_index;
const uint32_t super_tile_y = tile_y >> subsampling_index;
const uint32_t super_tiles_per_row =
VP8LSubSampleSize(width, min_bits + subsampling_index);
GetBestPredictorForTile(all_argb, subsampling_index, super_tile_x,
super_tile_y, super_tiles_per_row,
all_accumulated_argb, all_modes, all_pred_histos);
if (subsampling_index == max_subsampling_index) break;
// Update the following super-tile histogram if it has not been updated
// yet.
++subsampling_index;
if (subsampling_index > update_up_to_index &&
subsampling_index <= max_subsampling_index) {
VP8LAddVectorEq(
GetHistoArgbConst(all_argb, subsampling_index - 1, /*mode=*/0),
GetHistoArgb(all_argb, subsampling_index, /*mode=*/0),
HISTO_SIZE * kNumPredModes);
}
// Check whether the super-tile is not complete (i.e. the smallest tile is
// neither at the end of a line/column nor at the end of a super-tile of
// size (1 << subsampling_index)).
if (!((tile_x == (tiles_per_row - 1) ||
(local_tile_x + 1) % (1 << subsampling_index) == 0) &&
(tile_y == (tiles_per_col - 1) ||
(local_tile_y + 1) % (1 << subsampling_index) == 0))) {
--subsampling_index;
// subsampling_index now is the index of the last finished super-tile.
break;
}
}
// Reset all the histograms belonging to finished tiles.
memset(all_argb, 0,
HISTO_SIZE * kNumPredModes * (subsampling_index + 1) *
sizeof(*all_argb));
if (subsampling_index == max_subsampling_index) {
// If a new max-tile is started.
if (tile_x == (tiles_per_row - 1)) {
max_tile_x = 0;
++max_tile_y;
} else {
++max_tile_x;
}
local_tile_x = 0;
local_tile_y = 0;
} else {
// Proceed with the Z traversal.
uint32_t coord_x = local_tile_x >> subsampling_index;
uint32_t coord_y = local_tile_y >> subsampling_index;
if (tile_x == (tiles_per_row - 1) && coord_x % 2 == 0) {
++coord_y;
} else {
if (coord_x % 2 == 0) {
++coord_x;
} else {
// Z traversal.
++coord_y;
--coord_x;
}
}
local_tile_x = coord_x << subsampling_index;
local_tile_y = coord_y << subsampling_index;
}
tile_x = max_tile_x * max_tile_size + local_tile_x;
tile_y = max_tile_y * max_tile_size + local_tile_y;
if (tile_x == 0 &&
!WebPReportProgress(
pic, percent_start + percent_range * tile_y / tiles_per_col,
percent)) {
WebPSafeFree(raw_data);
return;
}
}
// Figure out the best sampling.
best_cost = WEBP_INT64_MAX;
for (subsampling_index = 0; subsampling_index <= max_subsampling_index;
++subsampling_index) {
int plane;
const uint32_t* const accumulated =
GetAccumulatedHisto(all_accumulated_argb, subsampling_index);
int64_t cost = VP8LShannonEntropy(
&all_pred_histos[subsampling_index * kNumPredModes], kNumPredModes);
for (plane = 0; plane < 4; ++plane) {
cost += VP8LShannonEntropy(&accumulated[plane * 256], 256);
}
if (cost < best_cost) {
best_cost = cost;
*best_bits = min_bits + subsampling_index;
*best_mode = all_modes[subsampling_index];
}
}
WebPSafeFree(raw_data);
VP8LOptimizeSampling(*best_mode, width, height, *best_bits,
MAX_TRANSFORM_BITS, best_bits);
}
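// A minimal standalone sketch (not part of libwebp): the zigzag order
// pictured in the comment above GetBestPredictorsAndSubSampling() matches a
// Z (Morton) order within each max-tile; SketchMortonRank is a hypothetical
// helper, not the encoder's actual loop.
#include <stdio.h>
static unsigned SketchMortonRank(unsigned x, unsigned y, int levels) {
  unsigned rank = 0;
  int i;
  for (i = 0; i < levels; ++i) {
    rank |= ((x >> i) & 1u) << (2 * i);      // x bits go to even positions
    rank |= ((y >> i) & 1u) << (2 * i + 1);  // y bits go to odd positions
  }
  return rank;
}
int main(void) {
  // For a 4x4 max-tile this prints the 1..16 figure from the comment.
  unsigned x, y;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) printf("%2u ", SketchMortonRank(x, y, 2) + 1);
    printf("\n");
  }
  return 0;
}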
// Finds the best predictor for each tile, and converts the image to residuals
// with respect to predictions. If near_lossless_quality < 100, applies
// near lossless processing, shaving off more bits of residuals for lower
// qualities.
int VP8LResidualImage(int width, int height, int bits, int low_effort,
uint32_t* const argb, uint32_t* const argb_scratch,
uint32_t* const image, int near_lossless_quality,
int exact, int used_subtract_green,
const WebPPicture* const pic, int percent_range,
int* const percent) {
const int tiles_per_row = VP8LSubSampleSize(width, bits);
const int tiles_per_col = VP8LSubSampleSize(height, bits);
int VP8LResidualImage(int width, int height, int min_bits, int max_bits,
int low_effort, uint32_t* const argb,
uint32_t* const argb_scratch, uint32_t* const image,
int near_lossless_quality, int exact,
int used_subtract_green, const WebPPicture* const pic,
int percent_range, int* const percent,
int* const best_bits) {
int percent_start = *percent;
int tile_y;
int histo[4][256];
const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
if (low_effort) {
const int tiles_per_row = VP8LSubSampleSize(width, max_bits);
const int tiles_per_col = VP8LSubSampleSize(height, max_bits);
int i;
for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
image[i] = ARGB_BLACK | (kPredLowEffort << 8);
}
*best_bits = max_bits;
} else {
memset(histo, 0, sizeof(histo));
for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
int tile_x;
for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
const int pred = GetBestPredictorForTile(
width, height, tile_x, tile_y, bits, histo, argb_scratch, argb,
max_quantization, exact, used_subtract_green, image);
image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
}
if (!WebPReportProgress(
pic, percent_start + percent_range * tile_y / tiles_per_col,
percent)) {
return 0;
}
// Allocate data to try all samplings from min_bits to max_bits.
int bits;
uint32_t sum_num_pixels = 0u;
uint32_t *modes_raw, *best_mode;
uint32_t* modes[MAX_TRANSFORM_BITS + 1];
uint32_t num_pixels[MAX_TRANSFORM_BITS + 1];
for (bits = min_bits; bits <= max_bits; ++bits) {
const int tiles_per_row = VP8LSubSampleSize(width, bits);
const int tiles_per_col = VP8LSubSampleSize(height, bits);
num_pixels[bits] = tiles_per_row * tiles_per_col;
sum_num_pixels += num_pixels[bits];
}
modes_raw = (uint32_t*)WebPSafeMalloc(sum_num_pixels, sizeof(*modes_raw));
if (modes_raw == NULL) return 0;
// Have modes point to the right global memory modes_raw.
modes[min_bits] = modes_raw;
for (bits = min_bits + 1; bits <= max_bits; ++bits) {
modes[bits] = modes[bits - 1] + num_pixels[bits - 1];
}
// Find the best sampling.
GetBestPredictorsAndSubSampling(
width, height, min_bits, max_bits, argb_scratch, argb, max_quantization,
exact, used_subtract_green, pic, percent_range, percent,
&modes[min_bits], best_bits, &best_mode);
if (*best_bits == 0) {
WebPSafeFree(modes_raw);
return 0;
}
// Keep the best predictor image.
memcpy(image, best_mode,
VP8LSubSampleSize(width, *best_bits) *
VP8LSubSampleSize(height, *best_bits) * sizeof(*image));
WebPSafeFree(modes_raw);
}
CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
CopyImageWithPrediction(width, height, *best_bits, image, argb_scratch, argb,
low_effort, max_quantization, exact,
used_subtract_green);
return WebPReportProgress(pic, percent_start + percent_range, percent);
@ -539,48 +851,51 @@ static WEBP_INLINE uint32_t MultipliersToColorCode(
m->green_to_red_;
}
static float PredictionCostCrossColor(const int accumulated[256],
const int counts[256]) {
static int64_t PredictionCostCrossColor(const uint32_t accumulated[256],
const uint32_t counts[256]) {
// Favor low entropy, locally and globally.
// Favor small absolute values for PredictionCostSpatial
static const float kExpValue = 2.4f;
return VP8LCombinedShannonEntropy(counts, accumulated) +
PredictionCostSpatial(counts, 3, kExpValue);
static const uint64_t kExpValue = 240;
return (int64_t)VP8LCombinedShannonEntropy(counts, accumulated) +
PredictionCostBias(counts, 3, kExpValue);
}
static float GetPredictionCostCrossColorRed(
static int64_t GetPredictionCostCrossColorRed(
const uint32_t* argb, int stride, int tile_width, int tile_height,
VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
const int accumulated_red_histo[256]) {
int histo[256] = { 0 };
float cur_diff;
const uint32_t accumulated_red_histo[256]) {
uint32_t histo[256] = { 0 };
int64_t cur_diff;
VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
green_to_red, histo);
cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
if ((uint8_t)green_to_red == prev_x.green_to_red_) {
cur_diff -= 3; // favor keeping the areas locally similar
// favor keeping the areas locally similar
cur_diff -= 3ll << LOG_2_PRECISION_BITS;
}
if ((uint8_t)green_to_red == prev_y.green_to_red_) {
cur_diff -= 3; // favor keeping the areas locally similar
// favor keeping the areas locally similar
cur_diff -= 3ll << LOG_2_PRECISION_BITS;
}
if (green_to_red == 0) {
cur_diff -= 3;
cur_diff -= 3ll << LOG_2_PRECISION_BITS;
}
return cur_diff;
}
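// A short note with a sketch (not part of libwebp): costs are now
// fixed-point log2 values instead of floats, so the old float bias of 3.0
// becomes 3ll << LOG_2_PRECISION_BITS. The precision value below is an
// assumption; the library defines LOG_2_PRECISION_BITS itself.
#include <stdint.h>
#include <stdio.h>
#define SKETCH_LOG_2_PRECISION_BITS 23
static double SketchCostToBits(int64_t cost) {
  return (double)cost / (double)(1ll << SKETCH_LOG_2_PRECISION_BITS);
}
int main(void) {
  const int64_t bias = 3ll << SKETCH_LOG_2_PRECISION_BITS;
  printf("bias = %.1f bits\n", SketchCostToBits(bias));  /* prints 3.0 */
  return 0;
}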
static void GetBestGreenToRed(
const uint32_t* argb, int stride, int tile_width, int tile_height,
VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
const int accumulated_red_histo[256], VP8LMultipliers* const best_tx) {
static void GetBestGreenToRed(const uint32_t* argb, int stride, int tile_width,
int tile_height, VP8LMultipliers prev_x,
VP8LMultipliers prev_y, int quality,
const uint32_t accumulated_red_histo[256],
VP8LMultipliers* const best_tx) {
const int kMaxIters = 4 + ((7 * quality) >> 8); // in range [4..6]
int green_to_red_best = 0;
int iter, offset;
float best_diff = GetPredictionCostCrossColorRed(
argb, stride, tile_width, tile_height, prev_x, prev_y,
green_to_red_best, accumulated_red_histo);
int64_t best_diff = GetPredictionCostCrossColorRed(
argb, stride, tile_width, tile_height, prev_x, prev_y, green_to_red_best,
accumulated_red_histo);
for (iter = 0; iter < kMaxIters; ++iter) {
// ColorTransformDelta is a 3.5 bit fixed point, so 32 is equal to
// one in color computation. Having initial delta here as 1 is sufficient
@ -589,7 +904,7 @@ static void GetBestGreenToRed(
// Try a negative and a positive delta from the best known value.
for (offset = -delta; offset <= delta; offset += 2 * delta) {
const int green_to_red_cur = offset + green_to_red_best;
const float cur_diff = GetPredictionCostCrossColorRed(
const int64_t cur_diff = GetPredictionCostCrossColorRed(
argb, stride, tile_width, tile_height, prev_x, prev_y,
green_to_red_cur, accumulated_red_histo);
if (cur_diff < best_diff) {
@ -601,45 +916,50 @@ static void GetBestGreenToRed(
best_tx->green_to_red_ = (green_to_red_best & 0xff);
}
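// A minimal standalone sketch (not part of libwebp) of the 3.5 fixed-point
// convention mentioned above: a multiplier of 32 means 1.0, so a color
// transform delta is (multiplier * channel) >> 5.
#include <assert.h>
#include <stdint.h>
static int SketchColorTransformDelta(int8_t multiplier, int8_t channel) {
  return ((int)multiplier * (int)channel) >> 5;
}
int main(void) {
  assert(SketchColorTransformDelta(32, 100) == 100);  /* 32/32 == 1.0 */
  assert(SketchColorTransformDelta(16, 100) == 50);   /* 16/32 == 0.5 */
  return 0;
}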
static float GetPredictionCostCrossColorBlue(
static int64_t GetPredictionCostCrossColorBlue(
const uint32_t* argb, int stride, int tile_width, int tile_height,
VP8LMultipliers prev_x, VP8LMultipliers prev_y,
int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256]) {
int histo[256] = { 0 };
float cur_diff;
VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_blue,
int red_to_blue, const uint32_t accumulated_blue_histo[256]) {
uint32_t histo[256] = { 0 };
int64_t cur_diff;
VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
green_to_blue, red_to_blue, histo);
cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
cur_diff -= 3; // favor keeping the areas locally similar
// favor keeping the areas locally similar
cur_diff -= 3ll << LOG_2_PRECISION_BITS;
}
if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
cur_diff -= 3; // favor keeping the areas locally similar
// favor keeping the areas locally similar
cur_diff -= 3ll << LOG_2_PRECISION_BITS;
}
if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
cur_diff -= 3; // favor keeping the areas locally similar
// favor keeping the areas locally similar
cur_diff -= 3ll << LOG_2_PRECISION_BITS;
}
if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
cur_diff -= 3; // favor keeping the areas locally similar
// favor keeping the areas locally similar
cur_diff -= 3ll << LOG_2_PRECISION_BITS;
}
if (green_to_blue == 0) {
cur_diff -= 3;
cur_diff -= 3ll << LOG_2_PRECISION_BITS;
}
if (red_to_blue == 0) {
cur_diff -= 3;
cur_diff -= 3ll << LOG_2_PRECISION_BITS;
}
return cur_diff;
}
#define kGreenRedToBlueNumAxis 8
#define kGreenRedToBlueMaxIters 7
static void GetBestGreenRedToBlue(
const uint32_t* argb, int stride, int tile_width, int tile_height,
VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
const int accumulated_blue_histo[256],
VP8LMultipliers* const best_tx) {
static void GetBestGreenRedToBlue(const uint32_t* argb, int stride,
int tile_width, int tile_height,
VP8LMultipliers prev_x,
VP8LMultipliers prev_y, int quality,
const uint32_t accumulated_blue_histo[256],
VP8LMultipliers* const best_tx) {
const int8_t offset[kGreenRedToBlueNumAxis][2] =
{{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };
@ -649,9 +969,9 @@ static void GetBestGreenRedToBlue(
int red_to_blue_best = 0;
int iter;
// Initial value at origin:
float best_diff = GetPredictionCostCrossColorBlue(
argb, stride, tile_width, tile_height, prev_x, prev_y,
green_to_blue_best, red_to_blue_best, accumulated_blue_histo);
int64_t best_diff = GetPredictionCostCrossColorBlue(
argb, stride, tile_width, tile_height, prev_x, prev_y, green_to_blue_best,
red_to_blue_best, accumulated_blue_histo);
for (iter = 0; iter < iters; ++iter) {
const int delta = delta_lut[iter];
int axis;
@ -659,7 +979,7 @@ static void GetBestGreenRedToBlue(
const int green_to_blue_cur =
offset[axis][0] * delta + green_to_blue_best;
const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
const float cur_diff = GetPredictionCostCrossColorBlue(
const int64_t cur_diff = GetPredictionCostCrossColorBlue(
argb, stride, tile_width, tile_height, prev_x, prev_y,
green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
if (cur_diff < best_diff) {
@ -684,13 +1004,10 @@ static void GetBestGreenRedToBlue(
#undef kGreenRedToBlueNumAxis
static VP8LMultipliers GetBestColorTransformForTile(
int tile_x, int tile_y, int bits,
VP8LMultipliers prev_x,
VP8LMultipliers prev_y,
int quality, int xsize, int ysize,
const int accumulated_red_histo[256],
const int accumulated_blue_histo[256],
const uint32_t* const argb) {
int tile_x, int tile_y, int bits, VP8LMultipliers prev_x,
VP8LMultipliers prev_y, int quality, int xsize, int ysize,
const uint32_t accumulated_red_histo[256],
const uint32_t accumulated_blue_histo[256], const uint32_t* const argb) {
const int max_tile_size = 1 << bits;
const int tile_y_offset = tile_y * max_tile_size;
const int tile_x_offset = tile_x * max_tile_size;
@ -728,13 +1045,13 @@ static void CopyTileWithColorTransform(int xsize, int ysize,
int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
uint32_t* const argb, uint32_t* image,
const WebPPicture* const pic, int percent_range,
int* const percent) {
int* const percent, int* const best_bits) {
const int max_tile_size = 1 << bits;
const int tile_xsize = VP8LSubSampleSize(width, bits);
const int tile_ysize = VP8LSubSampleSize(height, bits);
int percent_start = *percent;
int accumulated_red_histo[256] = { 0 };
int accumulated_blue_histo[256] = { 0 };
uint32_t accumulated_red_histo[256] = { 0 };
uint32_t accumulated_blue_histo[256] = { 0 };
int tile_x, tile_y;
VP8LMultipliers prev_x, prev_y;
MultipliersClear(&prev_y);
@ -788,5 +1105,7 @@ int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
return 0;
}
}
VP8LOptimizeSampling(image, width, height, bits, MAX_TRANSFORM_BITS,
best_bits);
return 1;
}

View File

@ -462,7 +462,7 @@ const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
static const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};
@ -478,7 +478,9 @@ void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
VP8EncPredChroma8(it->yuv_p_, left, top);
}
void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
// Form all the ten Intra4x4 predictions in the yuv_p_ cache
// for the 4x4 block it->i4_
static void MakeIntra4Preds(const VP8EncIterator* const it) {
VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}
@ -1102,7 +1104,7 @@ static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
uint8_t* tmp_dst = it->yuv_p_ + I4TMP; // scratch buffer.
InitScore(&rd_i4);
VP8MakeIntra4Preds(it);
MakeIntra4Preds(it);
for (mode = 0; mode < NUM_BMODES; ++mode) {
VP8ModeScore rd_tmp;
int16_t tmp_levels[16];
@ -1237,7 +1239,7 @@ static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
VP8MakeIntra4Preds(it);
MakeIntra4Preds(it);
nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
src, dst, mode) << it->i4_;
} while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
@ -1305,7 +1307,7 @@ static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
VP8MakeIntra4Preds(it);
MakeIntra4Preds(it);
for (mode = 0; mode < NUM_BMODES; ++mode) {
const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT

View File

@ -16,6 +16,7 @@
#include <string.h> // for memcpy()
#include "src/dec/common_dec.h"
#include "src/dsp/cpu.h"
#include "src/dsp/dsp.h"
#include "src/utils/bit_writer_utils.h"
#include "src/utils/thread_utils.h"
@ -31,7 +32,7 @@ extern "C" {
// version numbers
#define ENC_MAJ_VERSION 1
#define ENC_MIN_VERSION 4
#define ENC_MIN_VERSION 5
#define ENC_REV_VERSION 0
enum { MAX_LF_LEVELS = 64, // Maximum loop filter level
@ -78,7 +79,6 @@ typedef enum { // Rate-distortion optimization levels
extern const uint16_t VP8Scan[16];
extern const uint16_t VP8UVModeOffsets[4];
extern const uint16_t VP8I16ModeOffsets[4];
extern const uint16_t VP8I4ModeOffsets[NUM_BMODES];
// Layout of prediction blocks
// intra 16x16
@ -234,7 +234,11 @@ typedef struct {
VP8BitWriter* bw_; // current bit-writer
uint8_t* preds_; // intra mode predictors (4x4 blocks)
uint32_t* nz_; // non-zero pattern
#if WEBP_AARCH64 && BPS == 32
uint8_t i4_boundary_[40]; // 32+8 boundary samples needed by intra4x4
#else
uint8_t i4_boundary_[37]; // 32+5 boundary samples needed by intra4x4
#endif
uint8_t* i4_top_; // pointer to the current top boundary sample
int i4_; // current intra4x4 mode being tested
int top_nz_[9]; // top-non-zero context.
@ -267,8 +271,6 @@ typedef struct {
// in iterator.c
// must be called first
void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it);
// restart a scan
void VP8IteratorReset(VP8EncIterator* const it);
// reset iterator position to row 'y'
void VP8IteratorSetRow(VP8EncIterator* const it, int y);
// set count down (=number of iterations to go)
@ -444,9 +446,6 @@ extern const uint8_t VP8Cat6[];
void VP8MakeLuma16Preds(const VP8EncIterator* const it);
// Form all the four Chroma8x8 predictions in the yuv_p_ cache
void VP8MakeChroma8Preds(const VP8EncIterator* const it);
// Form all the ten Intra4x4 predictions in the yuv_p_ cache
// for the 4x4 block it->i4_
void VP8MakeIntra4Preds(const VP8EncIterator* const it);
// Rate calculation
int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd);
int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]);

View File

@ -30,6 +30,10 @@
// Maximum number of histogram images (sub-blocks).
#define MAX_HUFF_IMAGE_SIZE 2600
#define MAX_HUFFMAN_BITS (MIN_HUFFMAN_BITS + (1 << NUM_HUFFMAN_BITS) - 1)
// Empirical value above which computing the best predictor image becomes
// too computationally expensive.
#define MAX_PREDICTOR_IMAGE_SIZE (1 << 14)
// -----------------------------------------------------------------------------
// Palette
@ -140,8 +144,8 @@ static int AnalyzeEntropy(const uint32_t* argb,
curr_row += argb_stride;
}
{
float entropy_comp[kHistoTotal];
float entropy[kNumEntropyIx];
uint64_t entropy_comp[kHistoTotal];
uint64_t entropy[kNumEntropyIx];
int k;
int last_mode_to_analyze = use_palette ? kPalette : kSpatialSubGreen;
int j;
@ -179,19 +183,19 @@ static int AnalyzeEntropy(const uint32_t* argb,
// When including transforms, there is an overhead in bits from
// storing them. This overhead is small but matters for small images.
// For spatial, there are 14 transformations.
entropy[kSpatial] += VP8LSubSampleSize(width, transform_bits) *
entropy[kSpatial] += (uint64_t)VP8LSubSampleSize(width, transform_bits) *
VP8LSubSampleSize(height, transform_bits) *
VP8LFastLog2(14);
// For color transforms: 24 as only 3 channels are considered in a
// ColorTransformElement.
entropy[kSpatialSubGreen] += VP8LSubSampleSize(width, transform_bits) *
VP8LSubSampleSize(height, transform_bits) *
VP8LFastLog2(24);
entropy[kSpatialSubGreen] +=
(uint64_t)VP8LSubSampleSize(width, transform_bits) *
VP8LSubSampleSize(height, transform_bits) * VP8LFastLog2(24);
// For palettes, add the cost of storing the palette.
// We empirically estimate the cost of a compressed entry as 8 bits.
// The palette is differential-coded when compressed hence a much
// lower cost than sizeof(uint32_t)*8.
entropy[kPalette] += palette_size * 8;
entropy[kPalette] += (palette_size * 8ull) << LOG_2_PRECISION_BITS;
*min_entropy_ix = kDirect;
for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
@ -231,17 +235,33 @@ static int AnalyzeEntropy(const uint32_t* argb,
}
}
// Clamp histogram and transform bits.
static int ClampBits(int width, int height, int bits, int min_bits,
int max_bits, int image_size_max) {
int image_size;
bits = (bits < min_bits) ? min_bits : (bits > max_bits) ? max_bits : bits;
image_size = VP8LSubSampleSize(width, bits) * VP8LSubSampleSize(height, bits);
while (bits < max_bits && image_size > image_size_max) {
++bits;
image_size =
VP8LSubSampleSize(width, bits) * VP8LSubSampleSize(height, bits);
}
// In case the bits reduce the image too much, choose the smallest value
// setting the image size to 1.
while (bits > min_bits && image_size == 1) {
image_size = VP8LSubSampleSize(width, bits - 1) *
VP8LSubSampleSize(height, bits - 1);
if (image_size != 1) break;
--bits;
}
return bits;
}
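// A walkthrough of the clamping loop above (sketch, not upstream code),
// assuming VP8LSubSampleSize(size, bits) == ceil(size / 2^bits): a
// 16384x16384 image starting at bits == 7 walks 7 -> 8 -> 9 before the tile
// image fits under MAX_HUFF_IMAGE_SIZE (2600): 128*128 = 16384,
// 64*64 = 4096, 32*32 = 1024.
#include <assert.h>
#include <stdint.h>
static uint32_t SketchSubSampleSize(uint32_t size, uint32_t bits) {
  return (size + (1u << bits) - 1u) >> bits;
}
int main(void) {
  assert(SketchSubSampleSize(16384, 7) * SketchSubSampleSize(16384, 7) ==
         16384);
  assert(SketchSubSampleSize(16384, 9) * SketchSubSampleSize(16384, 9) ==
         1024);
  return 0;
}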
static int GetHistoBits(int method, int use_palette, int width, int height) {
// Make tile size a function of encoding method (Range: 0 to 6).
int histo_bits = (use_palette ? 9 : 7) - method;
while (1) {
const int huff_image_size = VP8LSubSampleSize(width, histo_bits) *
VP8LSubSampleSize(height, histo_bits);
if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
++histo_bits;
}
return (histo_bits < MIN_HUFFMAN_BITS) ? MIN_HUFFMAN_BITS :
(histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
const int histo_bits = (use_palette ? 9 : 7) - method;
return ClampBits(width, height, histo_bits, MIN_HUFFMAN_BITS,
MAX_HUFFMAN_BITS, MAX_HUFF_IMAGE_SIZE);
}
static int GetTransformBits(int method, int histo_bits) {
@ -280,7 +300,7 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
const int method = config->method;
const int low_effort = (config->method == 0);
int i;
int use_palette;
int use_palette, transform_bits;
int n_lz77s;
// If set to 0, analyze the cache with the computed cache value. If 1, also
// analyze with no-cache.
@ -297,7 +317,9 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
// Empirical bit sizes.
enc->histo_bits_ = GetHistoBits(method, use_palette,
pic->width, pic->height);
enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
transform_bits = GetTransformBits(method, enc->histo_bits_);
enc->predictor_transform_bits_ = transform_bits;
enc->cross_color_transform_bits_ = transform_bits;
if (low_effort) {
// AnalyzeEntropy is somewhat slow.
@ -311,8 +333,8 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
// Try out multiple LZ77 on images with few colors.
n_lz77s = (enc->palette_size_ > 0 && enc->palette_size_ <= 16) ? 2 : 1;
if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride, use_palette,
enc->palette_size_, enc->transform_bits_,
&min_entropy_ix, red_and_blue_always_zero)) {
enc->palette_size_, transform_bits, &min_entropy_ix,
red_and_blue_always_zero)) {
return 0;
}
if (method == 6 && config->quality == 100) {
@ -661,11 +683,12 @@ static WEBP_INLINE void WriteHuffmanCodeWithExtraBits(
VP8LPutBits(bw, (bits << depth) | symbol, depth + n_bits);
}
static int StoreImageToBitMask(
VP8LBitWriter* const bw, int width, int histo_bits,
const VP8LBackwardRefs* const refs,
const uint16_t* histogram_symbols,
const HuffmanTreeCode* const huffman_codes, const WebPPicture* const pic) {
static int StoreImageToBitMask(VP8LBitWriter* const bw, int width,
int histo_bits,
const VP8LBackwardRefs* const refs,
const uint32_t* histogram_symbols,
const HuffmanTreeCode* const huffman_codes,
const WebPPicture* const pic) {
const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
const int tile_mask = (histo_bits == 0) ? 0 : -(1 << histo_bits);
// x and y trace the position in the image.
@ -673,7 +696,7 @@ static int StoreImageToBitMask(
int y = 0;
int tile_x = x & tile_mask;
int tile_y = y & tile_mask;
int histogram_ix = histogram_symbols[0];
int histogram_ix = (histogram_symbols[0] >> 8) & 0xffff;
const HuffmanTreeCode* codes = huffman_codes + 5 * histogram_ix;
VP8LRefsCursor c = VP8LRefsCursorInit(refs);
while (VP8LRefsCursorOk(&c)) {
@ -681,8 +704,10 @@ static int StoreImageToBitMask(
if ((tile_x != (x & tile_mask)) || (tile_y != (y & tile_mask))) {
tile_x = x & tile_mask;
tile_y = y & tile_mask;
histogram_ix = histogram_symbols[(y >> histo_bits) * histo_xsize +
(x >> histo_bits)];
histogram_ix = (histogram_symbols[(y >> histo_bits) * histo_xsize +
(x >> histo_bits)] >> 8) & 0xffff;
codes = huffman_codes + 5 * histogram_ix;
}
if (PixOrCopyIsLiteral(v)) {
@ -738,7 +763,7 @@ static int EncodeImageNoHuffman(VP8LBitWriter* const bw,
VP8LBackwardRefs* refs;
HuffmanTreeToken* tokens = NULL;
HuffmanTreeCode huffman_codes[5] = {{0, NULL, NULL}};
const uint16_t histogram_symbols[1] = {0}; // only one tree, one symbol
const uint32_t histogram_symbols[1] = {0}; // only one tree, one symbol
int cache_bits = 0;
VP8LHistogramSet* histogram_image = NULL;
HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
@ -821,32 +846,32 @@ static int EncodeImageInternal(
VP8LBitWriter* const bw, const uint32_t* const argb,
VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[4], int width,
int height, int quality, int low_effort, const CrunchConfig* const config,
int* cache_bits, int histogram_bits, size_t init_byte_position,
int* cache_bits, int histogram_bits_in, size_t init_byte_position,
int* const hdr_size, int* const data_size, const WebPPicture* const pic,
int percent_range, int* const percent) {
const uint32_t histogram_image_xysize =
VP8LSubSampleSize(width, histogram_bits) *
VP8LSubSampleSize(height, histogram_bits);
VP8LSubSampleSize(width, histogram_bits_in) *
VP8LSubSampleSize(height, histogram_bits_in);
int remaining_percent = percent_range;
int percent_start = *percent;
VP8LHistogramSet* histogram_image = NULL;
VP8LHistogram* tmp_histo = NULL;
int histogram_image_size = 0;
uint32_t i, histogram_image_size = 0;
size_t bit_array_size = 0;
HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
HuffmanTreeToken* tokens = NULL;
HuffmanTreeCode* huffman_codes = NULL;
uint16_t* const histogram_symbols = (uint16_t*)WebPSafeMalloc(
histogram_image_xysize, sizeof(*histogram_symbols));
uint32_t* const histogram_argb = (uint32_t*)WebPSafeMalloc(
histogram_image_xysize, sizeof(*histogram_argb));
int sub_configs_idx;
int cache_bits_init, write_histogram_image;
VP8LBitWriter bw_init = *bw, bw_best;
int hdr_size_tmp;
VP8LHashChain hash_chain_histogram; // histogram image hash chain
size_t bw_size_best = ~(size_t)0;
assert(histogram_bits >= MIN_HUFFMAN_BITS);
assert(histogram_bits <= MAX_HUFFMAN_BITS);
assert(histogram_bits_in >= MIN_HUFFMAN_BITS);
assert(histogram_bits_in <= MAX_HUFFMAN_BITS);
assert(hdr_size != NULL);
assert(data_size != NULL);
@ -857,7 +882,7 @@ static int EncodeImageInternal(
}
// Make sure we can allocate the different objects.
if (huff_tree == NULL || histogram_symbols == NULL ||
if (huff_tree == NULL || histogram_argb == NULL ||
!VP8LHashChainInit(&hash_chain_histogram, histogram_image_xysize)) {
WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
goto Error;
@ -899,6 +924,7 @@ static int EncodeImageInternal(
for (i_cache = 0; i_cache < (sub_config->do_no_cache_ ? 2 : 1); ++i_cache) {
const int cache_bits_tmp = (i_cache == 0) ? cache_bits_best : 0;
int histogram_bits = histogram_bits_in;
// Speed-up: no need to study the no-cache case if it was already studied
// in i_cache == 0.
if (i_cache == 1 && cache_bits_best == 0) break;
@ -920,7 +946,7 @@ static int EncodeImageInternal(
if (!VP8LGetHistoImageSymbols(
width, height, &refs_array[i_cache], quality, low_effort,
histogram_bits, cache_bits_tmp, histogram_image, tmp_histo,
histogram_symbols, pic, i_percent_range, percent)) {
histogram_argb, pic, i_percent_range, percent)) {
goto Error;
}
// Create Huffman bit lengths and codes for each histogram image.
@ -953,26 +979,19 @@ static int EncodeImageInternal(
}
// Huffman image + meta huffman.
histogram_image_size = 0;
for (i = 0; i < histogram_image_xysize; ++i) {
if (histogram_argb[i] >= histogram_image_size) {
histogram_image_size = histogram_argb[i] + 1;
}
histogram_argb[i] <<= 8;
}
write_histogram_image = (histogram_image_size > 1);
VP8LPutBits(bw, write_histogram_image, 1);
if (write_histogram_image) {
uint32_t* const histogram_argb = (uint32_t*)WebPSafeMalloc(
histogram_image_xysize, sizeof(*histogram_argb));
int max_index = 0;
uint32_t i;
if (histogram_argb == NULL) {
WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
goto Error;
}
for (i = 0; i < histogram_image_xysize; ++i) {
const int symbol_index = histogram_symbols[i] & 0xffff;
histogram_argb[i] = (symbol_index << 8);
if (symbol_index >= max_index) {
max_index = symbol_index + 1;
}
}
histogram_image_size = max_index;
VP8LOptimizeSampling(histogram_argb, width, height, histogram_bits_in,
MAX_HUFFMAN_BITS, &histogram_bits);
VP8LPutBits(bw, histogram_bits - 2, 3);
i_percent_range = i_remaining_percent / 2;
i_remaining_percent -= i_percent_range;
@ -981,15 +1000,12 @@ static int EncodeImageInternal(
VP8LSubSampleSize(width, histogram_bits),
VP8LSubSampleSize(height, histogram_bits), quality, low_effort,
pic, i_percent_range, percent)) {
WebPSafeFree(histogram_argb);
goto Error;
}
WebPSafeFree(histogram_argb);
}
// Store Huffman codes.
{
int i;
int max_tokens = 0;
// Find maximum number of symbols for the huffman tree-set.
for (i = 0; i < 5 * histogram_image_size; ++i) {
@ -1012,7 +1028,7 @@ static int EncodeImageInternal(
// Store actual literals.
hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
if (!StoreImageToBitMask(bw, width, histogram_bits, &refs_array[i_cache],
histogram_symbols, huffman_codes, pic)) {
histogram_argb, huffman_codes, pic)) {
goto Error;
}
// Keep track of the smallest image so far.
@ -1049,7 +1065,7 @@ static int EncodeImageInternal(
WebPSafeFree(huffman_codes->codes);
WebPSafeFree(huffman_codes);
}
WebPSafeFree(histogram_symbols);
WebPSafeFree(histogram_argb);
VP8LBitWriterWipeOut(&bw_best);
return (pic->error_code == VP8_ENC_OK);
}
@ -1064,54 +1080,60 @@ static void ApplySubtractGreen(VP8LEncoder* const enc, int width, int height,
VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
}
static int ApplyPredictFilter(const VP8LEncoder* const enc, int width,
int height, int quality, int low_effort,
static int ApplyPredictFilter(VP8LEncoder* const enc, int width, int height,
int quality, int low_effort,
int used_subtract_green, VP8LBitWriter* const bw,
int percent_range, int* const percent) {
const int pred_bits = enc->transform_bits_;
const int transform_width = VP8LSubSampleSize(width, pred_bits);
const int transform_height = VP8LSubSampleSize(height, pred_bits);
// we disable near-lossless quantization if palette is used.
int best_bits;
const int near_lossless_strength =
enc->use_palette_ ? 100 : enc->config_->near_lossless;
const int max_bits = ClampBits(width, height, enc->predictor_transform_bits_,
MIN_TRANSFORM_BITS, MAX_TRANSFORM_BITS,
MAX_PREDICTOR_IMAGE_SIZE);
const int min_bits = ClampBits(
width, height,
max_bits - 2 * (enc->config_->method > 4 ? enc->config_->method - 4 : 0),
MIN_TRANSFORM_BITS, MAX_TRANSFORM_BITS, MAX_PREDICTOR_IMAGE_SIZE);
if (!VP8LResidualImage(
width, height, pred_bits, low_effort, enc->argb_, enc->argb_scratch_,
enc->transform_data_, near_lossless_strength, enc->config_->exact,
used_subtract_green, enc->pic_, percent_range / 2, percent)) {
if (!VP8LResidualImage(width, height, min_bits, max_bits, low_effort,
enc->argb_, enc->argb_scratch_, enc->transform_data_,
near_lossless_strength, enc->config_->exact,
used_subtract_green, enc->pic_, percent_range / 2,
percent, &best_bits)) {
return 0;
}
VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
assert(pred_bits >= 2);
VP8LPutBits(bw, pred_bits - 2, 3);
assert(best_bits >= MIN_TRANSFORM_BITS && best_bits <= MAX_TRANSFORM_BITS);
VP8LPutBits(bw, best_bits - MIN_TRANSFORM_BITS, NUM_TRANSFORM_BITS);
enc->predictor_transform_bits_ = best_bits;
return EncodeImageNoHuffman(
bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
(VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
bw, enc->transform_data_, &enc->hash_chain_, &enc->refs_[0],
VP8LSubSampleSize(width, best_bits), VP8LSubSampleSize(height, best_bits),
quality, low_effort, enc->pic_, percent_range - percent_range / 2,
percent);
}
static int ApplyCrossColorFilter(const VP8LEncoder* const enc, int width,
int height, int quality, int low_effort,
static int ApplyCrossColorFilter(VP8LEncoder* const enc, int width, int height,
int quality, int low_effort,
VP8LBitWriter* const bw, int percent_range,
int* const percent) {
const int ccolor_transform_bits = enc->transform_bits_;
const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
const int min_bits = enc->cross_color_transform_bits_;
int best_bits;
if (!VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
enc->argb_, enc->transform_data_, enc->pic_,
percent_range / 2, percent)) {
if (!VP8LColorSpaceTransform(width, height, min_bits, quality, enc->argb_,
enc->transform_data_, enc->pic_,
percent_range / 2, percent, &best_bits)) {
return 0;
}
VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2);
assert(ccolor_transform_bits >= 2);
VP8LPutBits(bw, ccolor_transform_bits - 2, 3);
assert(best_bits >= MIN_TRANSFORM_BITS && best_bits <= MAX_TRANSFORM_BITS);
VP8LPutBits(bw, best_bits - MIN_TRANSFORM_BITS, NUM_TRANSFORM_BITS);
enc->cross_color_transform_bits_ = best_bits;
return EncodeImageNoHuffman(
bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
(VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
bw, enc->transform_data_, &enc->hash_chain_, &enc->refs_[0],
VP8LSubSampleSize(width, best_bits), VP8LSubSampleSize(height, best_bits),
quality, low_effort, enc->pic_, percent_range - percent_range / 2,
percent);
}
@ -1199,8 +1221,8 @@ static int AllocateTransformBuffer(VP8LEncoder* const enc, int width,
: 0;
const uint64_t transform_data_size =
(enc->use_predict_ || enc->use_cross_color_)
? (uint64_t)VP8LSubSampleSize(width, enc->transform_bits_) *
VP8LSubSampleSize(height, enc->transform_bits_)
? (uint64_t)VP8LSubSampleSize(width, MIN_TRANSFORM_BITS) *
VP8LSubSampleSize(height, MIN_TRANSFORM_BITS)
: 0;
const uint64_t max_alignment_in_words =
(WEBP_ALIGN_CST + sizeof(uint32_t) - 1) / sizeof(uint32_t);
@ -1374,13 +1396,11 @@ static int ApplyPalette(const uint32_t* src, uint32_t src_stride, uint32_t* dst,
#undef APPLY_PALETTE_GREEDY_MAX
// Note: Expects "enc->palette_" to be set properly.
static int MapImageFromPalette(VP8LEncoder* const enc, int in_place) {
static int MapImageFromPalette(VP8LEncoder* const enc) {
const WebPPicture* const pic = enc->pic_;
const int width = pic->width;
const int height = pic->height;
const uint32_t* const palette = enc->palette_;
const uint32_t* src = in_place ? enc->argb_ : pic->argb;
const int src_stride = in_place ? enc->current_width_ : pic->argb_stride;
const int palette_size = enc->palette_size_;
int xbits;
@ -1395,9 +1415,9 @@ static int MapImageFromPalette(VP8LEncoder* const enc, int in_place) {
if (!AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height)) {
return 0;
}
if (!ApplyPalette(src, src_stride,
enc->argb_, enc->current_width_,
palette, palette_size, width, height, xbits, pic)) {
if (!ApplyPalette(pic->argb, pic->argb_stride, enc->argb_,
enc->current_width_, palette, palette_size, width, height,
xbits, pic)) {
return 0;
}
enc->argb_content_ = kEncoderPalette;
@ -1405,24 +1425,31 @@ static int MapImageFromPalette(VP8LEncoder* const enc, int in_place) {
}
// Save palette_[] to bitstream.
static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
VP8LEncoder* const enc,
int percent_range, int* const percent) {
static int EncodePalette(VP8LBitWriter* const bw, int low_effort,
VP8LEncoder* const enc, int percent_range,
int* const percent) {
int i;
uint32_t tmp_palette[MAX_PALETTE_SIZE];
const int palette_size = enc->palette_size_;
const uint32_t* const palette = enc->palette_;
// If the last element is 0, do not store it and count on automatic palette
// 0-filling. This can only happen if there is no pixel packing, hence if
// there are strictly more than 16 colors (after 0 is removed).
const uint32_t encoded_palette_size =
(enc->palette_[palette_size - 1] == 0 && palette_size > 17)
? palette_size - 1
: palette_size;
VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
VP8LPutBits(bw, COLOR_INDEXING_TRANSFORM, 2);
assert(palette_size >= 1 && palette_size <= MAX_PALETTE_SIZE);
VP8LPutBits(bw, palette_size - 1, 8);
for (i = palette_size - 1; i >= 1; --i) {
VP8LPutBits(bw, encoded_palette_size - 1, 8);
for (i = encoded_palette_size - 1; i >= 1; --i) {
tmp_palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
}
tmp_palette[0] = palette[0];
return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_,
&enc->refs_[0], palette_size, 1, /*quality=*/20,
low_effort, enc->pic_, percent_range, percent);
return EncodeImageNoHuffman(
bw, tmp_palette, &enc->hash_chain_, &enc->refs_[0], encoded_palette_size,
1, /*quality=*/20, low_effort, enc->pic_, percent_range, percent);
}
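// A minimal standalone sketch (not part of libwebp) of the per-channel delta
// coding done by the tmp_palette[] loop above; the library's own helper is
// VP8LSubPixels. Each byte lane is subtracted modulo 256, so deltas of a
// sorted palette stay small and compress well.
#include <assert.h>
#include <stdint.h>
static uint32_t SketchSubPixels(uint32_t a, uint32_t b) {
  uint32_t out = 0;
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    const uint32_t da = (a >> shift) & 0xffu;
    const uint32_t db = (b >> shift) & 0xffu;
    out |= ((da - db) & 0xffu) << shift;
  }
  return out;
}
int main(void) {
  assert(SketchSubPixels(0xff000010u, 0xff000008u) == 0x00000008u);
  return 0;
}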
// -----------------------------------------------------------------------------
@ -1493,7 +1520,6 @@ static int EncodeStreamHook(void* input, void* data2) {
#endif
int hdr_size = 0;
int data_size = 0;
int use_delta_palette = 0;
int idx;
size_t best_size = ~(size_t)0;
VP8LBitWriter bw_init = *bw, bw_best;
@ -1558,45 +1584,43 @@ static int EncodeStreamHook(void* input, void* data2) {
goto Error;
}
remaining_percent -= percent_range;
if (!MapImageFromPalette(enc, use_delta_palette)) goto Error;
if (!MapImageFromPalette(enc)) goto Error;
// If using a color cache, do not have it bigger than the number of
// colors.
if (enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
}
}
if (!use_delta_palette) {
// In case image is not packed.
if (enc->argb_content_ != kEncoderNearLossless &&
enc->argb_content_ != kEncoderPalette) {
if (!MakeInputImageCopy(enc)) goto Error;
}
// In case image is not packed.
if (enc->argb_content_ != kEncoderNearLossless &&
enc->argb_content_ != kEncoderPalette) {
if (!MakeInputImageCopy(enc)) goto Error;
}
// -----------------------------------------------------------------------
// Apply transforms and write transform data.
// -------------------------------------------------------------------------
// Apply transforms and write transform data.
if (enc->use_subtract_green_) {
ApplySubtractGreen(enc, enc->current_width_, height, bw);
}
if (enc->use_subtract_green_) {
ApplySubtractGreen(enc, enc->current_width_, height, bw);
}
if (enc->use_predict_) {
percent_range = remaining_percent / 3;
if (!ApplyPredictFilter(enc, enc->current_width_, height, quality,
low_effort, enc->use_subtract_green_, bw,
percent_range, &percent)) {
goto Error;
}
remaining_percent -= percent_range;
if (enc->use_predict_) {
percent_range = remaining_percent / 3;
if (!ApplyPredictFilter(enc, enc->current_width_, height, quality,
low_effort, enc->use_subtract_green_, bw,
percent_range, &percent)) {
goto Error;
}
remaining_percent -= percent_range;
}
if (enc->use_cross_color_) {
percent_range = remaining_percent / 2;
if (!ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
low_effort, bw, percent_range, &percent)) {
goto Error;
}
remaining_percent -= percent_range;
if (enc->use_cross_color_) {
percent_range = remaining_percent / 2;
if (!ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
low_effort, bw, percent_range, &percent)) {
goto Error;
}
remaining_percent -= percent_range;
}
VP8LPutBits(bw, !TRANSFORM_PRESENT, 1); // No more transforms.
@ -1625,7 +1649,8 @@ static int EncodeStreamHook(void* input, void* data2) {
if (enc->use_subtract_green_) stats->lossless_features |= 4;
if (enc->use_palette_) stats->lossless_features |= 8;
stats->histogram_bits = enc->histo_bits_;
stats->transform_bits = enc->transform_bits_;
stats->transform_bits = enc->predictor_transform_bits_;
stats->cross_color_transform_bits = enc->cross_color_transform_bits_;
stats->cache_bits = enc->cache_bits_;
stats->palette_size = enc->palette_size_;
stats->lossless_size = (int)(best_size - byte_position);
@ -1735,7 +1760,10 @@ int VP8LEncodeStream(const WebPConfig* const config,
}
// Copy the values that were computed for the main encoder.
enc_side->histo_bits_ = enc_main->histo_bits_;
enc_side->transform_bits_ = enc_main->transform_bits_;
enc_side->predictor_transform_bits_ =
enc_main->predictor_transform_bits_;
enc_side->cross_color_transform_bits_ =
enc_main->cross_color_transform_bits_;
enc_side->palette_size_ = enc_main->palette_size_;
memcpy(enc_side->palette_, enc_main->palette_,
sizeof(enc_main->palette_));

View File

@ -34,7 +34,7 @@ extern "C" {
#endif
// maximum value of transform_bits_ in VP8LEncoder.
#define MAX_TRANSFORM_BITS 6
#define MAX_TRANSFORM_BITS (MIN_TRANSFORM_BITS + (1 << NUM_TRANSFORM_BITS) - 1)
typedef enum {
kEncoderNone = 0,
@ -59,7 +59,8 @@ typedef struct {
// Encoding parameters derived from quality parameter.
int histo_bits_;
int transform_bits_; // <= MAX_TRANSFORM_BITS.
int predictor_transform_bits_; // <= MAX_TRANSFORM_BITS
int cross_color_transform_bits_; // <= MAX_TRANSFORM_BITS
int cache_bits_; // If equal to 0, don't use color cache.
// Encoding parameters derived from image characteristics.
@ -104,16 +105,21 @@ int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
// pic and percent are for progress.
// Returns false in case of error (stored in pic->error_code).
int VP8LResidualImage(int width, int height, int bits, int low_effort,
uint32_t* const argb, uint32_t* const argb_scratch,
uint32_t* const image, int near_lossless, int exact,
int used_subtract_green, const WebPPicture* const pic,
int percent_range, int* const percent);
int VP8LResidualImage(int width, int height, int min_bits, int max_bits,
int low_effort, uint32_t* const argb,
uint32_t* const argb_scratch, uint32_t* const image,
int near_lossless, int exact, int used_subtract_green,
const WebPPicture* const pic, int percent_range,
int* const percent, int* const best_bits);
int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
uint32_t* const argb, uint32_t* image,
const WebPPicture* const pic, int percent_range,
int* const percent);
int* const percent, int* const best_bits);
void VP8LOptimizeSampling(uint32_t* const image, int full_width,
int full_height, int bits, int max_bits,
int* best_bits_out);
//------------------------------------------------------------------------------

View File

@ -191,7 +191,8 @@ int WebPAnimEncoderOptionsInitInternal(WebPAnimEncoderOptions* enc_options,
return 1;
}
// This starting value is more fit to WebPCleanupTransparentAreaLossless().
// This value is used to match a later call to WebPReplaceTransparentPixels(),
// making it a no-op for lossless (see WebPEncode()).
#define TRANSPARENT_COLOR 0x00000000
static void ClearRectangle(WebPPicture* const picture,

View File

@ -28,7 +28,7 @@ extern "C" {
// Defines and constants.
#define MUX_MAJ_VERSION 1
#define MUX_MIN_VERSION 4
#define MUX_MIN_VERSION 5
#define MUX_REV_VERSION 0
// Chunk object.

View File

@ -223,11 +223,13 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
// Note this padding is historical and differs from demux.c which does not
// pad the file size.
riff_size = SizeWithPadding(riff_size);
if (riff_size < CHUNK_HEADER_SIZE) goto Err;
// Make sure the whole RIFF header is available.
if (riff_size < RIFF_HEADER_SIZE) goto Err;
if (riff_size > size) goto Err;
// There's no point in reading past the end of the RIFF chunk.
if (size > riff_size + CHUNK_HEADER_SIZE) {
size = riff_size + CHUNK_HEADER_SIZE;
// There's no point in reading past the end of the RIFF chunk. Note riff_size
// includes CHUNK_HEADER_SIZE after SizeWithPadding().
if (size > riff_size) {
size = riff_size;
}
end = data + size;
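// Worked example (sketch, not upstream code): assuming SizeWithPadding()
// pads the payload to an even length and adds the 8-byte chunk header,
// e.g. CHUNK_HEADER_SIZE + ((riff_size + 1) & ~1), an 11-byte payload
// yields 8 + 12 == 20, which is why 'riff_size' already accounts for the
// header the old code added separately.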

View File

@ -124,7 +124,8 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits,
#if defined(__arm__) || defined(_M_ARM) || WEBP_AARCH64 || \
defined(__i386__) || defined(_M_IX86) || \
defined(__x86_64__) || defined(_M_X64)
defined(__x86_64__) || defined(_M_X64) || \
defined(__wasm__)
#define VP8L_USE_FAST_LOAD
#endif

View File

@ -69,6 +69,8 @@ extern "C" {
#define BITS 56
#elif defined(__mips__) // MIPS
#define BITS 24
#elif defined(__wasm__) // WASM
#define BITS 56
#else // reasonable default
#define BITS 24
#endif

View File

@ -191,6 +191,12 @@ static void PaletteSortMinimizeDeltas(const uint32_t* const palette_sorted,
// Greedily pick, at each step, the color closest to the predicted color to
// minimize deltas in the palette. This reduces storage needs since the
// palette is stored with delta encoding.
if (num_colors > 17) {
if (palette[0] == 0) {
--num_colors;
SwapColor(&palette[num_colors], &palette[0]);
}
}
for (i = 0; i < num_colors; ++i) {
int best_ix = i;
uint32_t best_score = ~0U;
@ -384,8 +390,13 @@ int PaletteSort(PaletteSorting method, const struct WebPPicture* const pic,
uint32_t* const palette) {
switch (method) {
case kSortedDefault:
// Nothing to do, we have already sorted the palette.
memcpy(palette, palette_sorted, num_colors * sizeof(*palette));
if (palette_sorted[0] == 0 && num_colors > 17) {
memcpy(palette, palette_sorted + 1,
(num_colors - 1) * sizeof(*palette_sorted));
palette[num_colors - 1] = 0;
} else {
memcpy(palette, palette_sorted, num_colors * sizeof(*palette));
}
return 1;
case kMinimizeDelta:
PaletteSortMinimizeDeltas(palette_sorted, num_colors, palette);

View File

@ -53,6 +53,8 @@ int GetColorPalette(const struct WebPPicture* const pic,
// Sorts the palette according to the criterion defined by 'method'.
// 'palette_sorted' is the input palette sorted lexicographically, as done in
// PrepareMapToPalette. Returns 0 on memory allocation error.
// For kSortedDefault and kMinimizeDelta methods, 0 (if present) is set as the
// last element to optimize later storage.
int PaletteSort(PaletteSorting method, const struct WebPPicture* const pic,
const uint32_t* const palette_sorted, uint32_t num_colors,
uint32_t* const palette);
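// A minimal standalone sketch (not part of libwebp) of the rule above: with
// more than 17 colors and 0 present (lexicographically first), 0 is rotated
// to the end so that EncodePalette() can drop it.
#include <assert.h>
#include <stdint.h>
#include <string.h>
int main(void) {
  uint32_t sorted[18], palette[18];
  int i;
  for (i = 0; i < 18; ++i) sorted[i] = (uint32_t)i; /* sorted[0] == 0 */
  memcpy(palette, sorted + 1, 17 * sizeof(*palette));
  palette[17] = 0;
  assert(palette[0] == 1 && palette[17] == 0);
  return 0;
}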

View File

@ -20,7 +20,7 @@
extern "C" {
#endif
#define WEBP_ENCODER_ABI_VERSION 0x020f // MAJOR(8b) + MINOR(8b)
#define WEBP_ENCODER_ABI_VERSION 0x0210 // MAJOR(8b) + MINOR(8b)
// Note: forward declaring enumerations is not allowed in (strict) C and C++,
// the types are left here for reference.
@ -145,7 +145,7 @@ struct WebPConfig {
// RGB information for better compression. The default
// value is 0.
int use_delta_palette; // reserved for future lossless feature
int use_delta_palette; // reserved
int use_sharp_yuv; // if needed, use sharp (and slow) RGB->YUV conversion
int qmin; // minimum permissible quality factor
@ -224,14 +224,15 @@ struct WebPAuxStats {
uint32_t lossless_features; // bit0:predictor bit1:cross-color transform
// bit2:subtract-green bit3:color indexing
int histogram_bits; // number of precision bits of histogram
int transform_bits; // precision bits for transform
int transform_bits; // precision bits for predictor transform
int cache_bits; // number of bits for color cache lookup
int palette_size; // number of color in palette, if used
int lossless_size; // final lossless size
int lossless_hdr_size; // lossless header (transform, huffman etc) size
int lossless_data_size; // lossless image data size
int cross_color_transform_bits; // precision bits for cross-color transform
uint32_t pad[2]; // padding for later use
uint32_t pad[1]; // padding for later use
};
// Signature for output function. Should return true if writing was successful.

View File

@ -46,7 +46,12 @@
#define CODE_LENGTH_CODES 19
#define MIN_HUFFMAN_BITS 2 // min number of Huffman bits
#define MAX_HUFFMAN_BITS 9 // max number of Huffman bits
#define NUM_HUFFMAN_BITS 3
// the maximum number of bits defining a transform is
// MIN_TRANSFORM_BITS + (1 << NUM_TRANSFORM_BITS) - 1
#define MIN_TRANSFORM_BITS 2
#define NUM_TRANSFORM_BITS 3
#define TRANSFORM_PRESENT 1 // The bit to be written when next data
// to be read is a transform.
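// For reference, the arithmetic behind these constants (sketch, not
// upstream code): the bitstream stores best_bits - MIN_TRANSFORM_BITS in
// NUM_TRANSFORM_BITS bits, so transform bits range over [2, 9].
#include <assert.h>
#define SKETCH_MIN_TRANSFORM_BITS 2
#define SKETCH_NUM_TRANSFORM_BITS 3
#define SKETCH_MAX_TRANSFORM_BITS \
  (SKETCH_MIN_TRANSFORM_BITS + (1 << SKETCH_NUM_TRANSFORM_BITS) - 1)
int main(void) {
  assert(SKETCH_MAX_TRANSFORM_BITS == 9);
  return 0;
}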

View File

@ -38,11 +38,11 @@ typedef long long int int64_t;
#ifndef WEBP_NODISCARD
#if defined(WEBP_ENABLE_NODISCARD) && WEBP_ENABLE_NODISCARD
#if (defined(__cplusplus) && __cplusplus >= 201700L) || \
#if (defined(__cplusplus) && __cplusplus >= 201703L) || \
(defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L)
#define WEBP_NODISCARD [[nodiscard]]
#else
// gcc's __has_attribute does not work for enums.
// gcc's __attribute__((warn_unused_result)) does not work for enums.
#if defined(__clang__) && defined(__has_attribute)
#if __has_attribute(warn_unused_result)
#define WEBP_NODISCARD __attribute__((warn_unused_result))