libwebp: Update to 1.5.0

Rémi Verschelde 2025-01-09 15:17:51 +01:00
parent 1f787b63a5
commit 01f88ff138
89 changed files with 3131 additions and 2143 deletions

View File

@@ -582,7 +582,7 @@ Files extracted from upstream source:
 ## libwebp
 - Upstream: https://chromium.googlesource.com/webm/libwebp/
-- Version: 1.4.0 (845d5476a866141ba35ac133f856fa62f0b7445f, 2024)
+- Version: 1.5.0 (a4d7a715337ded4451fec90ff8ce79728e04126c, 2024)
 - License: BSD-3-Clause
 Files extracted from upstream source:

View File

@@ -11,11 +11,13 @@ Contributors:
 - Christopher Degawa (ccom at randomderp dot com)
 - Clement Courbet (courbet at google dot com)
 - Djordje Pesut (djordje dot pesut at imgtec dot com)
+- Frank (1433351828 at qq dot com)
 - Frank Barchard (fbarchard at google dot com)
 - Hui Su (huisu at google dot com)
 - H. Vetinari (h dot vetinari at gmx dot com)
 - Ilya Kurdyukov (jpegqs at gmail dot com)
 - Ingvar Stepanyan (rreverser at google dot com)
+- Istvan Stefan (Istvan dot Stefan at arm dot com)
 - James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
 - Jehan (jehan at girinstud dot io)
@@ -62,6 +64,7 @@ Contributors:
 - Vincent Rabaud (vrabaud at google dot com)
 - Vlad Tsyrklevich (vtsyrklevich at chromium dot org)
 - Wan-Teh Chang (wtc at google dot com)
+- wrv (wrv at utexas dot edu)
 - Yang Zhang (yang dot zhang at arm dot com)
 - Yannis Guyon (yguyon at google dot com)
 - Zhi An Ng (zhin at chromium dot org)

View File

@@ -565,10 +565,11 @@ int SharpYuvConvertWithOptions(const void* r_ptr, const void* g_ptr,
   scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
   scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);
-  return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride,
-                          rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride,
-                          v_ptr, v_stride, yuv_bit_depth, width, height,
-                          &scaled_matrix, transfer_type);
+  return DoSharpArgbToYuv(
+      (const uint8_t*)r_ptr, (const uint8_t*)g_ptr, (const uint8_t*)b_ptr,
+      rgb_step, rgb_stride, rgb_bit_depth, (uint8_t*)y_ptr, y_stride,
+      (uint8_t*)u_ptr, u_stride, (uint8_t*)v_ptr, v_stride, yuv_bit_depth,
+      width, height, &scaled_matrix, transfer_type);
 }
 //------------------------------------------------------------------------------

View File

@@ -52,7 +52,7 @@ extern "C" {
 // SharpYUV API version following the convention from semver.org
 #define SHARPYUV_VERSION_MAJOR 0
 #define SHARPYUV_VERSION_MINOR 4
-#define SHARPYUV_VERSION_PATCH 0
+#define SHARPYUV_VERSION_PATCH 1
 // Version as a uint32_t. The major number is the high 8 bits.
 // The minor number is the middle 8 bits. The patch number is the low 16 bits.
 #define SHARPYUV_MAKE_VERSION(MAJOR, MINOR, PATCH) \
@@ -66,10 +66,17 @@ extern "C" {
 SHARPYUV_EXTERN int SharpYuvGetVersion(void);
 // RGB to YUV conversion matrix, in 16 bit fixed point.
-// y = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
-// u = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
-// v = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
-// Then y, u and v values are divided by 1<<16 and rounded.
+// y_ = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
+// u_ = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
+// v_ = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
+// Then the values are divided by 1<<16 and rounded.
+// y = (y_ + (1 << 15)) >> 16
+// u = (u_ + (1 << 15)) >> 16
+// v = (v_ + (1 << 15)) >> 16
+//
+// Typically, the offset values rgb_to_y[3], rgb_to_u[3] and rgb_to_v[3] depend
+// on the input's bit depth, e.g., rgb_to_u[3] = 1 << (rgb_bit_depth - 1 + 16).
+// See also sharpyuv_csp.h to get a predefined matrix or generate a matrix.
 typedef struct {
   int rgb_to_y[4];
   int rgb_to_u[4];
@@ -127,6 +134,8 @@ typedef enum SharpYuvTransferFunctionType {
 //     adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they
 //     should be multiples of 2.
 // width, height: width and height of the image in pixels
+// yuv_matrix: RGB to YUV conversion matrix. The matrix values typically
+//     depend on the input's rgb_bit_depth.
 // This function calls SharpYuvConvertWithOptions with a default transfer
 // function of kSharpYuvTransferFunctionSrgb.
 SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
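To make the fixed-point comments above concrete, here is a small standalone sketch (my own illustration, not library code; ApplyMatrixRow is a hypothetical helper) that applies one row of a SharpYuvConversionMatrix to an 8-bit RGB triplet:

```c
#include <stdint.h>

// Hypothetical helper, not part of the SharpYUV API: applies one row of a
// SharpYuvConversionMatrix to an 8-bit r,g,b triplet with the rounding
// documented above:
//   out = (row[0]*r + row[1]*g + row[2]*b + row[3] + (1 << 15)) >> 16
static int ApplyMatrixRow(const int row[4], int r, int g, int b) {
  const int64_t acc = (int64_t)row[0] * r + (int64_t)row[1] * g +
                      (int64_t)row[2] * b + row[3];
  return (int)((acc + (1 << 15)) >> 16);
}
```

As a sanity check of the offsets: with the kWebpMatrix luma row {16839, 33059, 6420, 16 << 16} and r = g = b = 255, this yields 235, the limited-range white point.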

View File

@@ -22,16 +22,16 @@ void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
   const float kr = yuv_color_space->kr;
   const float kb = yuv_color_space->kb;
   const float kg = 1.0f - kr - kb;
-  const float cr = 0.5f / (1.0f - kb);
-  const float cb = 0.5f / (1.0f - kr);
+  const float cb = 0.5f / (1.0f - kb);
+  const float cr = 0.5f / (1.0f - kr);
   const int shift = yuv_color_space->bit_depth - 8;
   const float denom = (float)((1 << yuv_color_space->bit_depth) - 1);
   float scale_y = 1.0f;
   float add_y = 0.0f;
-  float scale_u = cr;
-  float scale_v = cb;
+  float scale_u = cb;
+  float scale_v = cr;
   float add_uv = (float)(128 << shift);
   assert(yuv_color_space->bit_depth >= 8);
@@ -59,31 +59,35 @@ void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
 }
 // Matrices are in YUV_FIX fixed point precision.
-// WebP's matrix, similar but not identical to kRec601LimitedMatrix.
+// WebP's matrix, similar but not identical to kRec601LimitedMatrix
+// Derived using the following formulas:
+//   Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
+//   U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
+//   V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
 static const SharpYuvConversionMatrix kWebpMatrix = {
   {16839, 33059, 6420, 16 << 16},
   {-9719, -19081, 28800, 128 << 16},
   {28800, -24116, -4684, 128 << 16},
 };
-// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeLimited
+// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeLimited
 static const SharpYuvConversionMatrix kRec601LimitedMatrix = {
   {16829, 33039, 6416, 16 << 16},
   {-9714, -19071, 28784, 128 << 16},
   {28784, -24103, -4681, 128 << 16},
 };
-// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeFull
+// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeFull
 static const SharpYuvConversionMatrix kRec601FullMatrix = {
   {19595, 38470, 7471, 0},
   {-11058, -21710, 32768, 128 << 16},
   {32768, -27439, -5329, 128 << 16},
 };
-// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeLimited
+// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeLimited
 static const SharpYuvConversionMatrix kRec709LimitedMatrix = {
   {11966, 40254, 4064, 16 << 16},
   {-6596, -22189, 28784, 128 << 16},
   {28784, -26145, -2639, 128 << 16},
 };
-// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeFull
+// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeFull
 static const SharpYuvConversionMatrix kRec709FullMatrix = {
   {13933, 46871, 4732, 0},
   {-7509, -25259, 32768, 128 << 16},
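The renamed bit_depth comments pair each table with the Kr/Kb weights it was derived from. For the full-range case the luma row is simply the weights scaled by 1 << 16; a quick verification sketch (my own check, not library code):

```c
#include <math.h>
#include <stdio.h>

// Recomputes the kRec601FullMatrix luma row from Kr/Kb: for full range the
// luma scale is 1.0, so each coefficient is the weight times 1 << 16.
int main(void) {
  const double kr = 0.2990, kb = 0.1140, kg = 1.0 - kr - kb;
  printf("%ld %ld %ld\n", lround(kr * 65536), lround(kg * 65536),
         lround(kb * 65536));  // prints: 19595 38470 7471
  return 0;
}
```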

View File

@@ -41,10 +41,15 @@ SHARPYUV_EXTERN void SharpYuvComputeConversionMatrix(
 // Enums for precomputed conversion matrices.
 typedef enum {
+  // WebP's matrix, similar but not identical to kSharpYuvMatrixRec601Limited
   kSharpYuvMatrixWebp = 0,
+  // Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeLimited
   kSharpYuvMatrixRec601Limited,
+  // Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeFull
   kSharpYuvMatrixRec601Full,
+  // Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeLimited
   kSharpYuvMatrixRec709Limited,
+  // Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeFull
   kSharpYuvMatrixRec709Full,
   kSharpYuvMatrixNum
 } SharpYuvMatrixType;
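For context, this is roughly how the enum and the convert entry point fit together; a minimal sketch assuming an interleaved 8-bit RGB source (the buffer layout, strides, and include paths here are illustrative assumptions, not prescribed by the API):

```c
#include <stdint.h>
#include "sharpyuv/sharpyuv.h"
#include "sharpyuv/sharpyuv_csp.h"

// Converts interleaved 8-bit RGB to 4:2:0 YUV with a precomputed matrix.
// Returns 0 on failure, like SharpYuvConvert itself.
static int ConvertRgbToYuv420(const uint8_t* rgb, int width, int height,
                              uint8_t* y, uint8_t* u, uint8_t* v) {
  const SharpYuvConversionMatrix* matrix =
      SharpYuvGetConversionMatrix(kSharpYuvMatrixWebp);
  // r/g/b pointers address the same interleaved buffer, 3 bytes per pixel.
  return SharpYuvConvert(rgb + 0, rgb + 1, rgb + 2,
                         /*rgb_step=*/3, /*rgb_stride=*/width * 3,
                         /*rgb_bit_depth=*/8,
                         y, /*y_stride=*/width,
                         u, /*u_stride=*/(width + 1) / 2,
                         v, /*v_stride=*/(width + 1) / 2,
                         /*yuv_bit_depth=*/8, width, height, matrix);
}
```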

View File

@@ -16,7 +16,8 @@
 #include "src/utils/bit_reader_inl_utils.h"
 #if !defined(USE_GENERIC_TREE)
-#if !defined(__arm__) && !defined(_M_ARM) && !WEBP_AARCH64
+#if !defined(__arm__) && !defined(_M_ARM) && !WEBP_AARCH64 && \
+    !defined(__wasm__)
 // using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
 #define USE_GENERIC_TREE 1   // ALTERNATE_CODE
 #else

View File

@@ -32,7 +32,7 @@ extern "C" {
 // version numbers
 #define DEC_MAJ_VERSION 1
-#define DEC_MIN_VERSION 4
+#define DEC_MIN_VERSION 5
 #define DEC_REV_VERSION 0
 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).

View File

@@ -20,10 +20,9 @@
 #include "src/dsp/dsp.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
-#include "src/dsp/yuv.h"
-#include "src/utils/endian_inl_utils.h"
 #include "src/utils/huffman_utils.h"
 #include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
 #define NUM_ARGB_CACHE_ROWS 16
@@ -381,7 +380,8 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
   if (allow_recursion && VP8LReadBits(br, 1)) {
     // use meta Huffman codes.
-    const int huffman_precision = VP8LReadBits(br, 3) + 2;
+    const int huffman_precision =
+        MIN_HUFFMAN_BITS + VP8LReadBits(br, NUM_HUFFMAN_BITS);
     const int huffman_xsize = VP8LSubSampleSize(xsize, huffman_precision);
     const int huffman_ysize = VP8LSubSampleSize(ysize, huffman_precision);
     const int huffman_pixs = huffman_xsize * huffman_ysize;
@@ -1351,7 +1351,8 @@ static int ReadTransform(int* const xsize, int const* ysize,
   switch (type) {
     case PREDICTOR_TRANSFORM:
     case CROSS_COLOR_TRANSFORM:
-      transform->bits_ = VP8LReadBits(br, 3) + 2;
+      transform->bits_ =
+          MIN_TRANSFORM_BITS + VP8LReadBits(br, NUM_TRANSFORM_BITS);
       ok = DecodeImageStream(VP8LSubSampleSize(transform->xsize_,
                                                transform->bits_),
                              VP8LSubSampleSize(transform->ysize_,
@@ -1416,7 +1417,9 @@ VP8LDecoder* VP8LNew(void) {
   return dec;
 }
-void VP8LClear(VP8LDecoder* const dec) {
+// Resets the decoder in its initial state, reclaiming memory.
+// Preserves the dec->status_ value.
+static void VP8LClear(VP8LDecoder* const dec) {
   int i;
   if (dec == NULL) return;
   ClearMetadata(&dec->hdr_);
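The two decoder hunks above replace the hard-coded `VP8LReadBits(br, 3) + 2` with named constants. Assuming the constants in src/webp/format_constants.h carry the values implied by the old expression (an assumption worth verifying against that header), the bitstream semantics are unchanged:

```c
// Assumed values, matching the replaced "VP8LReadBits(br, 3) + 2":
#define MIN_HUFFMAN_BITS   2  // assumed
#define NUM_HUFFMAN_BITS   3  // assumed
#define MIN_TRANSFORM_BITS 2  // assumed
#define NUM_TRANSFORM_BITS 3  // assumed
// Both forms read 3 bits and bias by 2, so the precision stays in [2, 9]:
//   old: VP8LReadBits(br, 3) + 2
//   new: MIN_HUFFMAN_BITS + VP8LReadBits(br, NUM_HUFFMAN_BITS)
```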

View File

@@ -121,10 +121,6 @@ WEBP_NODISCARD int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);
 // this function. Returns false in case of error, with updated dec->status_.
 WEBP_NODISCARD int VP8LDecodeImage(VP8LDecoder* const dec);
-// Resets the decoder in its initial state, reclaiming memory.
-// Preserves the dec->status_ value.
-void VP8LClear(VP8LDecoder* const dec);
 // Clears and deallocate a lossless decoder instance.
 void VP8LDelete(VP8LDecoder* const dec);

View File

@@ -24,7 +24,7 @@
 #include "src/webp/format_constants.h"
 #define DMUX_MAJ_VERSION 1
-#define DMUX_MIN_VERSION 4
+#define DMUX_MIN_VERSION 5
 #define DMUX_REV_VERSION 0
 typedef struct {

View File

@@ -354,8 +354,8 @@ static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
   return cost;
 }
-static void SetResidualCoeffs_C(const int16_t* const coeffs,
-                                VP8Residual* const res) {
+static void SetResidualCoeffs_C(const int16_t* WEBP_RESTRICT const coeffs,
+                                VP8Residual* WEBP_RESTRICT const res) {
   int n;
   res->last = -1;
   assert(res->first == 0 || coeffs[0] == 0);
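This hunk is the first of many below that add WEBP_RESTRICT to pointer parameters. The macro is libwebp's portable spelling of the C99 `restrict` qualifier; a rough sketch of its shape (the exact compiler checks live in the library's headers, so treat this as an approximation):

```c
// Approximate definition (see libwebp's headers for the real conditions).
// Declaring the pointers restrict promises the compiler that, e.g., coeffs
// and res never alias, enabling more aggressive reordering and vectorization.
#if !defined(WEBP_RESTRICT)
#if defined(__GNUC__) || defined(_MSC_VER)
#define WEBP_RESTRICT __restrict
#else
#define WEBP_RESTRICT
#endif
#endif
```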

View File

@@ -96,8 +96,8 @@ static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
   return cost;
 }
-static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
-                                     VP8Residual* const res) {
+static void SetResidualCoeffs_MIPS32(const int16_t* WEBP_RESTRICT const coeffs,
+                                     VP8Residual* WEBP_RESTRICT const res) {
   const int16_t* p_coeffs = (int16_t*)coeffs;
   int temp0, temp1, temp2, n, n1;
   assert(res->first == 0 || coeffs[0] == 0);

View File

@ -19,8 +19,8 @@
static const uint8_t position[16] = { 1, 2, 3, 4, 5, 6, 7, 8, static const uint8_t position[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16 }; 9, 10, 11, 12, 13, 14, 15, 16 };
static void SetResidualCoeffs_NEON(const int16_t* const coeffs, static void SetResidualCoeffs_NEON(const int16_t* WEBP_RESTRICT const coeffs,
VP8Residual* const res) { VP8Residual* WEBP_RESTRICT const res) {
const int16x8_t minus_one = vdupq_n_s16(-1); const int16x8_t minus_one = vdupq_n_s16(-1);
const int16x8_t coeffs_0 = vld1q_s16(coeffs); const int16x8_t coeffs_0 = vld1q_s16(coeffs);
const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8); const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8);

View File

@@ -22,8 +22,8 @@
 //------------------------------------------------------------------------------
-static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
-                                   VP8Residual* const res) {
+static void SetResidualCoeffs_SSE2(const int16_t* WEBP_RESTRICT const coeffs,
+                                   VP8Residual* WEBP_RESTRICT const res) {
   const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
   const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
   // Use SSE2 to compare 16 values with a single instruction.

View File

@@ -38,7 +38,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 } while (0)
 #if !WEBP_NEON_OMIT_C_CODE
-static void TransformOne_C(const int16_t* in, uint8_t* dst) {
+static void TransformOne_C(const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst) {
   int C[4 * 4], *tmp;
   int i;
   tmp = C;
@@ -82,7 +83,8 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
 }
 // Simplified transform when only in[0], in[1] and in[4] are non-zero
-static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_C(const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst) {
   const int a = in[0] + 4;
   const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
   const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@@ -95,7 +97,8 @@ static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
 }
 #undef STORE2
-static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo_C(const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst, int do_two) {
   TransformOne_C(in, dst);
   if (do_two) {
     TransformOne_C(in + 16, dst + 4);
@@ -103,13 +106,15 @@ static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
-static void TransformUV_C(const int16_t* in, uint8_t* dst) {
+static void TransformUV_C(const int16_t* WEBP_RESTRICT in,
+                          uint8_t* WEBP_RESTRICT dst) {
   VP8Transform(in + 0 * 16, dst, 1);
   VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }
 #if !WEBP_NEON_OMIT_C_CODE
-static void TransformDC_C(const int16_t* in, uint8_t* dst) {
+static void TransformDC_C(const int16_t* WEBP_RESTRICT in,
+                          uint8_t* WEBP_RESTRICT dst) {
   const int DC = in[0] + 4;
   int i, j;
   for (j = 0; j < 4; ++j) {
@@ -120,7 +125,8 @@ static void TransformDC_C(const int16_t* in, uint8_t* dst) {
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
-static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
+static void TransformDCUV_C(const int16_t* WEBP_RESTRICT in,
+                            uint8_t* WEBP_RESTRICT dst) {
   if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
   if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
   if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@@ -133,7 +139,8 @@ static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
 // Paragraph 14.3
 #if !WEBP_NEON_OMIT_C_CODE
-static void TransformWHT_C(const int16_t* in, int16_t* out) {
+static void TransformWHT_C(const int16_t* WEBP_RESTRICT in,
+                           int16_t* WEBP_RESTRICT out) {
   int tmp[16];
   int i;
   for (i = 0; i < 4; ++i) {
@@ -161,7 +168,7 @@ static void TransformWHT_C(const int16_t* in, int16_t* out) {
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
-void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
+VP8WHT VP8TransformWHT;
 //------------------------------------------------------------------------------
 // Intra predictions
@@ -661,32 +668,32 @@ static void HFilter16i_C(uint8_t* p, int stride,
 #if !WEBP_NEON_OMIT_C_CODE
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                       int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                       int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 #if !WEBP_NEON_OMIT_C_CODE
-static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
-                        int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                        int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
-                        int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                        int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
@@ -694,8 +701,8 @@ static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
 //------------------------------------------------------------------------------
-static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
-                               int dst_stride) {
+static void DitherCombine8x8_C(const uint8_t* WEBP_RESTRICT dither,
+                               uint8_t* WEBP_RESTRICT dst, int dst_stride) {
   int i, j;
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i) {
@@ -730,8 +737,8 @@ VP8SimpleFilterFunc VP8SimpleHFilter16;
 VP8SimpleFilterFunc VP8SimpleVFilter16i;
 VP8SimpleFilterFunc VP8SimpleHFilter16i;
-void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
-                            int dst_stride);
+void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
+                            uint8_t* WEBP_RESTRICT dst, int dst_stride);
 extern VP8CPUInfo VP8GetCPUInfo;
 extern void VP8DspInitSSE2(void);

View File

@@ -133,26 +133,26 @@ static void HFilter16(uint8_t* p, int stride,
 }
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                     int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                     int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                      int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                      int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
@@ -215,7 +215,8 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
   }
 }
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
   int temp0, temp1, temp2, temp3, temp4;
   int temp5, temp6, temp7, temp8, temp9;
   int temp10, temp11, temp12, temp13, temp14;
@@ -532,7 +533,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
   );
 }
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst, int do_two) {
   TransformOne(in, dst);
   if (do_two) {
     TransformOne(in + 16, dst + 4);

View File

@@ -21,7 +21,8 @@
 static const int kC1 = WEBP_TRANSFORM_AC3_C1;
 static const int kC2 = WEBP_TRANSFORM_AC3_C2;
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+static void TransformDC(const int16_t* WEBP_RESTRICT in,
+                        uint8_t* WEBP_RESTRICT dst) {
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
   __asm__ volatile (
@@ -45,7 +46,8 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
   );
 }
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
   const int a = in[0] + 4;
   int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
   const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@@ -81,7 +83,8 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
   );
 }
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
@@ -148,7 +151,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
   );
 }
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst, int do_two) {
   TransformOne(in, dst);
   if (do_two) {
     TransformOne(in + 16, dst + 4);
@@ -434,14 +438,14 @@ static void HFilter16(uint8_t* p, int stride,
 }
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                     int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                     int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
@@ -465,14 +469,14 @@ static void HFilter16i(uint8_t* p, int stride,
   }
 }
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                      int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                      int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }

View File

@@ -38,7 +38,8 @@
     BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);  \
 }
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
   v8i16 input0, input1;
   v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
   v4i32 res0, res1, res2, res3;
@@ -65,14 +66,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
   ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst, int do_two) {
   TransformOne(in, dst);
   if (do_two) {
     TransformOne(in + 16, dst + 4);
   }
 }
-static void TransformWHT(const int16_t* in, int16_t* out) {
+static void TransformWHT(const int16_t* WEBP_RESTRICT in,
+                         int16_t* WEBP_RESTRICT out) {
   v8i16 input0, input1;
   const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
   const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
@@ -114,13 +117,15 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
   out[240] = __msa_copy_s_h(out1, 7);
 }
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+static void TransformDC(const int16_t* WEBP_RESTRICT in,
+                        uint8_t* WEBP_RESTRICT dst) {
   const int DC = (in[0] + 4) >> 3;
   const v8i16 tmp0 = __msa_fill_h(DC);
   ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS);
 }
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
   const int a = in[0] + 4;
   const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
   const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@@ -475,8 +480,8 @@ static void HFilter16i(uint8_t* src_y, int stride,
 }
 // 8-pixels wide variants, for chroma filtering
-static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
-                     int b_limit_in, int limit_in, int thresh_in) {
+static void VFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
+                     int stride, int b_limit_in, int limit_in, int thresh_in) {
   uint8_t* ptmp_src_u = src_u - 4 * stride;
   uint8_t* ptmp_src_v = src_v - 4 * stride;
   uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
@@ -520,8 +525,8 @@ static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
   SD(q2_d, ptmp_src_v);
 }
-static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
-                     int b_limit_in, int limit_in, int thresh_in) {
+static void HFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
+                     int stride, int b_limit_in, int limit_in, int thresh_in) {
   uint8_t* ptmp_src_u = src_u - 4;
   uint8_t* ptmp_src_v = src_v - 4;
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
@@ -556,7 +561,8 @@ static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
   ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);
 }
-static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+static void VFilter8i(uint8_t* WEBP_RESTRICT src_u,
+                      uint8_t* WEBP_RESTRICT src_v, int stride,
                       int b_limit_in, int limit_in, int thresh_in) {
   uint64_t p1_d, p0_d, q0_d, q1_d;
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
@@ -587,7 +593,8 @@ static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
   SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);
 }
-static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+static void HFilter8i(uint8_t* WEBP_RESTRICT src_u,
+                      uint8_t* WEBP_RESTRICT src_v, int stride,
                       int b_limit_in, int limit_in, int thresh_in) {
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
   v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;

View File

@@ -916,8 +916,8 @@ static void HFilter16i_NEON(uint8_t* p, int stride,
 #endif  // !WORK_AROUND_GCC
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                          int stride, int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
@@ -932,7 +932,8 @@ static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
     Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
   }
 }
-static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                           int stride,
                            int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   u += 4 * stride;
@@ -949,8 +950,8 @@ static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
 }
 #if !defined(WORK_AROUND_GCC)
-static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                          int stride, int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
@@ -964,7 +965,8 @@ static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
   }
 }
-static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                           int stride,
                            int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   u += 4;
@@ -1041,7 +1043,8 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
   Transpose8x2_NEON(E0, E1, rows);
 }
-static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst) {
   int16x8x2_t rows;
   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
   TransformPass_NEON(&rows);
@@ -1051,7 +1054,8 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
 #else
-static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst) {
   const int kBPS = BPS;
   // kC1, kC2. Padded because vld1.16 loads 8 bytes
   const int16_t constants[4] = { kC1, kC2, 0, 0 };
@@ -1184,14 +1188,16 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
 #endif    // WEBP_USE_INTRINSICS
-static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst, int do_two) {
   TransformOne_NEON(in, dst);
   if (do_two) {
     TransformOne_NEON(in + 16, dst + 4);
   }
 }
-static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in,
+                             uint8_t* WEBP_RESTRICT dst) {
   const int16x8_t DC = vdupq_n_s16(in[0]);
   Add4x4_NEON(DC, DC, dst);
 }
@@ -1205,7 +1211,8 @@ static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
     *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16;  \
 } while (0)
-static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
+static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in,
+                              int16_t* WEBP_RESTRICT out) {
   int32x4x4_t tmp;
   {
@@ -1256,7 +1263,8 @@ static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
 //------------------------------------------------------------------------------
-static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst) {
   const int16x4_t A = vld1_dup_s16(in);
   const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
   const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
@@ -1300,18 +1308,19 @@ static void DC4_NEON(uint8_t* dst) {    // DC
 static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
-  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
+  const uint16x8_t d = vsubl_u8(T, TL);  // A[c] - A[-1]
   int y;
   for (y = 0; y < size; y += 4) {
     // left edge
-    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
-    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
-    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
-    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
-    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
-    const int16x8_t r1 = vaddq_s16(L1, d);
-    const int16x8_t r2 = vaddq_s16(L2, d);
-    const int16x8_t r3 = vaddq_s16(L3, d);
+    const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
+    const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
+    const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
+    const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
+    // L[r] + A[c] - A[-1]
+    const int16x8_t r0 = vreinterpretq_s16_u16(vaddw_u8(d, L0));
+    const int16x8_t r1 = vreinterpretq_s16_u16(vaddw_u8(d, L1));
+    const int16x8_t r2 = vreinterpretq_s16_u16(vaddw_u8(d, L2));
+    const int16x8_t r3 = vreinterpretq_s16_u16(vaddw_u8(d, L3));
     // Saturate and store the result.
     const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
     const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
@@ -1572,23 +1581,24 @@ static void TM16_NEON(uint8_t* dst) {
   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
   // A[c] - A[-1]
-  const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
-  const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
+  const uint16x8_t d_lo = vsubl_u8(vget_low_u8(T), TL);
+  const uint16x8_t d_hi = vsubl_u8(vget_high_u8(T), TL);
   int y;
   for (y = 0; y < 16; y += 4) {
     // left edge
-    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
-    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
-    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
-    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
-    const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
-    const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
-    const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
-    const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
-    const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
-    const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
-    const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
-    const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
+    const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
+    const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
+    const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
+    const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
+    // L[r] + A[c] - A[-1]
+    const int16x8_t r0_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L0));
+    const int16x8_t r1_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L1));
+    const int16x8_t r2_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L2));
+    const int16x8_t r3_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L3));
+    const int16x8_t r0_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L0));
+    const int16x8_t r1_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L1));
+    const int16x8_t r2_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L2));
+    const int16x8_t r3_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L3));
     // Saturate and store the result.
     const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
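The TrueMotion hunks above only change vector types (widening unsigned adds via vaddw_u8 instead of reinterpret-then-signed-add); the predictor they implement is unchanged. As a scalar reference (my own paraphrase of the VP8 TM predictor, not code from the library):

```c
#include <stdint.h>

// Scalar reference for the TM (TrueMotion) predictor the NEON code above
// vectorizes: dst[y][x] = clip(L[y] + T[x] - TL), where T is the row above
// the block, L the column to its left, and TL the top-left corner pixel.
static uint8_t Clip255(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

static void TrueMotionScalar(uint8_t* dst, int stride, int size) {
  const uint8_t* const top = dst - stride;  // T: pixels above the block
  const int tl = dst[-stride - 1];          // TL: top-left pixel
  int x, y;
  for (y = 0; y < size; ++y) {
    const int left = dst[y * stride - 1];   // L[y]: pixel left of this row
    for (x = 0; x < size; ++x) {
      dst[y * stride + x] = Clip255(left + top[x] - tl);
    }
  }
}
```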

View File

@@ -30,7 +30,8 @@
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
-static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
+static void Transform_SSE2(const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst, int do_two) {
   // This implementation makes use of 16-bit fixed point versions of two
   // multiply constants:
   //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -197,7 +198,8 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
 #if (USE_TRANSFORM_AC3 == 1)
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_SSE2(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst) {
   const __m128i A = _mm_set1_epi16(in[0] + 4);
   const __m128i c4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
   const __m128i d4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
@@ -792,8 +794,8 @@ static void HFilter16i_SSE2(uint8_t* p, int stride,
 }
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                          int stride, int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i t1, p2, p1, p0, q0, q1, q2;
@@ -817,8 +819,8 @@ static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
   STOREUV(q2, u, v, 2 * stride);
 }
-static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                          int stride, int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
@@ -837,7 +839,8 @@ static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
   Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
 }
-static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                           int stride,
                            int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i t1, t2, p1, p0, q0, q1;
@@ -863,7 +866,8 @@ static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
   STOREUV(q1, u, v, 1 * stride);
 }
-static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                           int stride,
                            int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i t1, t2, p1, p0, q0, q1;

View File

@ -60,53 +60,66 @@ extern "C" {
// Transforms // Transforms
// VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms // VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
// will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4). // will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst, typedef void (*VP8Idct)(const uint8_t* WEBP_RESTRICT ref,
int do_two); const int16_t* WEBP_RESTRICT in,
typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out); uint8_t* WEBP_RESTRICT dst, int do_two);
typedef void (*VP8WHT)(const int16_t* in, int16_t* out); typedef void (*VP8Fdct)(const uint8_t* WEBP_RESTRICT src,
const uint8_t* WEBP_RESTRICT ref,
int16_t* WEBP_RESTRICT out);
typedef void (*VP8WHT)(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out);
extern VP8Idct VP8ITransform; extern VP8Idct VP8ITransform;
extern VP8Fdct VP8FTransform; extern VP8Fdct VP8FTransform;
extern VP8Fdct VP8FTransform2; // performs two transforms at a time extern VP8Fdct VP8FTransform2; // performs two transforms at a time
extern VP8WHT VP8FTransformWHT; extern VP8WHT VP8FTransformWHT;
// Predictions // Predictions
// *dst is the destination block. *top and *left can be NULL. // *dst is the destination block. *top and *left can be NULL.
typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left, typedef void (*VP8IntraPreds)(uint8_t* WEBP_RESTRICT dst,
const uint8_t* top); const uint8_t* WEBP_RESTRICT left,
typedef void (*VP8Intra4Preds)(uint8_t* dst, const uint8_t* top); const uint8_t* WEBP_RESTRICT top);
typedef void (*VP8Intra4Preds)(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top);
extern VP8Intra4Preds VP8EncPredLuma4; extern VP8Intra4Preds VP8EncPredLuma4;
extern VP8IntraPreds VP8EncPredLuma16; extern VP8IntraPreds VP8EncPredLuma16;
extern VP8IntraPreds VP8EncPredChroma8; extern VP8IntraPreds VP8EncPredChroma8;
typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref); typedef int (*VP8Metric)(const uint8_t* WEBP_RESTRICT pix,
const uint8_t* WEBP_RESTRICT ref);
extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4; extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref, typedef int (*VP8WMetric)(const uint8_t* WEBP_RESTRICT pix,
const uint16_t* const weights); const uint8_t* WEBP_RESTRICT ref,
const uint16_t* WEBP_RESTRICT const weights);
// The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major // The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major
// 4 by 4 symmetric matrix. // 4 by 4 symmetric matrix.
extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16; extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
// Compute the average (DC) of four 4x4 blocks. // Compute the average (DC) of four 4x4 blocks.
// Each sub-4x4 block #i sum is stored in dc[i]. // Each sub-4x4 block #i sum is stored in dc[i].
typedef void (*VP8MeanMetric)(const uint8_t* ref, uint32_t dc[4]); typedef void (*VP8MeanMetric)(const uint8_t* WEBP_RESTRICT ref,
uint32_t dc[4]);
extern VP8MeanMetric VP8Mean16x4; extern VP8MeanMetric VP8Mean16x4;
typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst); typedef void (*VP8BlockCopy)(const uint8_t* WEBP_RESTRICT src,
uint8_t* WEBP_RESTRICT dst);
extern VP8BlockCopy VP8Copy4x4; extern VP8BlockCopy VP8Copy4x4;
extern VP8BlockCopy VP8Copy16x8; extern VP8BlockCopy VP8Copy16x8;
// Quantization // Quantization
struct VP8Matrix; // forward declaration struct VP8Matrix; // forward declaration
typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16], typedef int (*VP8QuantizeBlock)(
const struct VP8Matrix* const mtx); int16_t in[16], int16_t out[16],
const struct VP8Matrix* WEBP_RESTRICT const mtx);
// Same as VP8QuantizeBlock, but quantizes two consecutive blocks. // Same as VP8QuantizeBlock, but quantizes two consecutive blocks.
typedef int (*VP8Quantize2Blocks)(int16_t in[32], int16_t out[32], typedef int (*VP8Quantize2Blocks)(
const struct VP8Matrix* const mtx); int16_t in[32], int16_t out[32],
const struct VP8Matrix* WEBP_RESTRICT const mtx);
extern VP8QuantizeBlock VP8EncQuantizeBlock; extern VP8QuantizeBlock VP8EncQuantizeBlock;
extern VP8Quantize2Blocks VP8EncQuantize2Blocks; extern VP8Quantize2Blocks VP8EncQuantize2Blocks;
// specific to 2nd transform:
typedef int (*VP8QuantizeBlockWHT)(
    int16_t in[16], int16_t out[16],
    const struct VP8Matrix* WEBP_RESTRICT const mtx);
extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;

extern const int VP8DspScan[16 + 4 + 4];

@@ -118,9 +131,10 @@ typedef struct {
  int max_value;
  int last_non_zero;
} VP8Histogram;
typedef void (*VP8CHisto)(const uint8_t* WEBP_RESTRICT ref,
                          const uint8_t* WEBP_RESTRICT pred,
                          int start_block, int end_block,
                          VP8Histogram* WEBP_RESTRICT const histo);
extern VP8CHisto VP8CollectHistogram;
// General-purpose util function to help VP8CollectHistogram().
void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],

@@ -138,8 +152,9 @@ extern const uint16_t VP8LevelFixedCosts[2047 /*MAX_LEVEL*/ + 1];
extern const uint8_t VP8EncBands[16 + 1];

struct VP8Residual;
typedef void (*VP8SetResidualCoeffsFunc)(
    const int16_t* WEBP_RESTRICT const coeffs,
    struct VP8Residual* WEBP_RESTRICT const res);
extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;

// Cost calculation function.

@@ -193,9 +208,11 @@ void VP8SSIMDspInit(void);
//------------------------------------------------------------------------------
// Decoding

typedef void (*VP8DecIdct)(const int16_t* WEBP_RESTRICT coeffs,
                           uint8_t* WEBP_RESTRICT dst);
// when doing two transforms, coeffs is actually int16_t[2][16].
typedef void (*VP8DecIdct2)(const int16_t* WEBP_RESTRICT coeffs,
                            uint8_t* WEBP_RESTRICT dst, int do_two);
extern VP8DecIdct2 VP8Transform;
extern VP8DecIdct VP8TransformAC3;
extern VP8DecIdct VP8TransformUV;
@@ -233,7 +250,8 @@ extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
// regular filter (on both macroblock edges and inner edges)
typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
                                  int thresh, int ithresh, int hev_t);
typedef void (*VP8ChromaFilterFunc)(uint8_t* WEBP_RESTRICT u,
                                    uint8_t* WEBP_RESTRICT v, int stride,
                                    int thresh, int ithresh, int hev_t);
// on outer edge
extern VP8LumaFilterFunc VP8VFilter16;

@@ -253,8 +271,8 @@ extern VP8ChromaFilterFunc VP8HFilter8i;
#define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1))
#define VP8_DITHER_AMP_BITS 7
#define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS)
extern void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
                                   uint8_t* WEBP_RESTRICT dst, int dst_stride);

// must be called before anything using the above
void VP8DspInit(void);

@@ -267,10 +285,10 @@ void VP8DspInit(void);
// Convert a pair of y/u/v lines together to the output rgb/a colorspace.
// bottom_y can be NULL if only one line of output is needed (at top/bottom).
typedef void (*WebPUpsampleLinePairFunc)(
    const uint8_t* WEBP_RESTRICT top_y, const uint8_t* WEBP_RESTRICT bottom_y,
    const uint8_t* WEBP_RESTRICT top_u, const uint8_t* WEBP_RESTRICT top_v,
    const uint8_t* WEBP_RESTRICT cur_u, const uint8_t* WEBP_RESTRICT cur_v,
    uint8_t* WEBP_RESTRICT top_dst, uint8_t* WEBP_RESTRICT bottom_dst, int len);
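A hedged usage sketch of the NULL convention described above: emitting
the single final row of an odd-height image. This is the pattern a
fancy-upsampling driver typically uses at the bottom border, passing the
current chroma row for both the 'top' and 'cur' pair (an illustration,
not code from the library):

static void EmitLastRow(WebPUpsampleLinePairFunc upsample,
                        const uint8_t* last_y,
                        const uint8_t* last_u, const uint8_t* last_v,
                        uint8_t* last_dst, int width) {
  // Only one output line is needed: bottom_y and bottom_dst are NULL.
  upsample(last_y, NULL, last_u, last_v, last_u, last_v,
           last_dst, NULL, width);
}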
#ifdef FANCY_UPSAMPLING

@@ -280,13 +298,15 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
#endif    // FANCY_UPSAMPLING

// Per-row point-sampling methods.
typedef void (*WebPSamplerRowFunc)(const uint8_t* WEBP_RESTRICT y,
                                   const uint8_t* WEBP_RESTRICT u,
                                   const uint8_t* WEBP_RESTRICT v,
                                   uint8_t* WEBP_RESTRICT dst, int len);
// Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride,
                             const uint8_t* WEBP_RESTRICT u,
                             const uint8_t* WEBP_RESTRICT v, int uv_stride,
                             uint8_t* WEBP_RESTRICT dst, int dst_stride,
                             int width, int height, WebPSamplerRowFunc func);
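A minimal sketch of what this generic helper does, assuming 4:2:0
subsampling where each chroma row is shared by two luma rows (this is
the shape of the plain-C implementation, shown here for orientation):

static void SamplerProcessPlane_Sketch(const uint8_t* y, int y_stride,
                                       const uint8_t* u, const uint8_t* v,
                                       int uv_stride,
                                       uint8_t* dst, int dst_stride,
                                       int width, int height,
                                       WebPSamplerRowFunc func) {
  int j;
  for (j = 0; j < height; ++j) {
    func(y, u, v, dst, width);
    y += y_stride;
    if (j & 1) {       // advance chroma only every other luma row
      u += uv_stride;
      v += uv_stride;
    }
    dst += dst_stride;
  }
}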
// Sampling functions to convert rows of YUV to RGB(A)

@@ -298,9 +318,10 @@ extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);

// YUV444->RGB converters
typedef void (*WebPYUV444Converter)(const uint8_t* WEBP_RESTRICT y,
                                    const uint8_t* WEBP_RESTRICT u,
                                    const uint8_t* WEBP_RESTRICT v,
                                    uint8_t* WEBP_RESTRICT dst, int len);
extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];

@@ -316,26 +337,35 @@ void WebPInitYUV444Converters(void);
// ARGB -> YUV converters

// Convert ARGB samples to luma Y.
extern void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
                                  uint8_t* WEBP_RESTRICT y, int width);
// Convert ARGB samples to U/V with downsampling. do_store should be '1' for
// even lines and '0' for odd ones. 'src_width' is the original width, not
// the U/V one.
extern void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
                                   uint8_t* WEBP_RESTRICT u,
                                   uint8_t* WEBP_RESTRICT v,
                                   int src_width, int do_store);
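A hedged sketch of the do_store convention: the even source row stores
freshly computed U/V samples, and the odd row of the same pair merges
its values in (the plain-C fallback averages with the stored value),
completing the vertical half of the 2x2 down-sampling:

static void ConvertRowPairToUV(const uint32_t* argb_even,
                               const uint32_t* argb_odd,
                               uint8_t* u, uint8_t* v, int src_width) {
  WebPConvertARGBToUV(argb_even, u, v, src_width, 1);  // store
  WebPConvertARGBToUV(argb_odd,  u, v, src_width, 0);  // merge/average in
}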
// Convert a row of accumulated (four-values) of rgba32 toward U/V
extern void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
                                     uint8_t* WEBP_RESTRICT u,
                                     uint8_t* WEBP_RESTRICT v, int width);

// Convert RGB or BGR to Y
extern void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb,
                                   uint8_t* WEBP_RESTRICT y, int width);
extern void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr,
                                   uint8_t* WEBP_RESTRICT y, int width);

// used for plain-C fallback.
extern void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
                                  uint8_t* WEBP_RESTRICT u,
                                  uint8_t* WEBP_RESTRICT v,
                                  int src_width, int do_store);
extern void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
                                    uint8_t* WEBP_RESTRICT u,
                                    uint8_t* WEBP_RESTRICT v, int width);

// Must be called before using the above.
void WebPInitConvertARGBToYUV(void);

@@ -348,8 +378,9 @@ struct WebPRescaler;

// Import a row of data and save its contribution in the rescaler.
// 'channel' denotes the channel number to be imported. 'Expand' corresponds to
// the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
typedef void (*WebPRescalerImportRowFunc)(
    struct WebPRescaler* WEBP_RESTRICT const wrk,
    const uint8_t* WEBP_RESTRICT src);
extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink;

@@ -362,16 +393,19 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;

// Plain-C implementation, as fall-back.
extern void WebPRescalerImportRowExpand_C(
    struct WebPRescaler* WEBP_RESTRICT const wrk,
    const uint8_t* WEBP_RESTRICT src);
extern void WebPRescalerImportRowShrink_C(
    struct WebPRescaler* WEBP_RESTRICT const wrk,
    const uint8_t* WEBP_RESTRICT src);
extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);

// Main entry calls:
extern void WebPRescalerImportRow(
    struct WebPRescaler* WEBP_RESTRICT const wrk,
    const uint8_t* WEBP_RESTRICT src);
// Export one row (starting at x_out position) from rescaler.
extern void WebPRescalerExportRow(struct WebPRescaler* const wrk);
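A rough usage sketch of the import/export rhythm. The row-readiness test
is internal bookkeeping, so RowIsReady() below is a hypothetical
stand-in, not a libwebp function:

static void RescalePlane_Sketch(struct WebPRescaler* const wrk,
                                const uint8_t* src, int src_stride,
                                int src_height) {
  int j;
  for (j = 0; j < src_height; ++j) {
    WebPRescalerImportRow(wrk, src + j * src_stride);
    // Drain every destination row that became complete (hypothetical test).
    while (RowIsReady(wrk)) WebPRescalerExportRow(wrk);
  }
}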
@@ -480,8 +514,9 @@ typedef enum {  // Filter types.
  WEBP_FILTER_FAST
} WEBP_FILTER_TYPE;

typedef void (*WebPFilterFunc)(const uint8_t* WEBP_RESTRICT in,
                               int width, int height, int stride,
                               uint8_t* WEBP_RESTRICT out);
// In-place un-filtering.
// Warning! 'prev_line' pointer can be equal to 'cur_line' or 'preds'.
typedef void (*WebPUnfilterFunc)(const uint8_t* prev_line, const uint8_t* preds,
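For orientation, a simplified sketch of the simplest of these filters
(horizontal): each output byte is the difference from its left
neighbour, and un-filtering reverses this by cumulative addition. Since
that reversal may run in place, the un-filter pointers above keep plain,
possibly-aliasing parameters. The library's real filters also predict
the left edge from the row above; that detail is omitted here:

static void HorizontalFilterRow_Sketch(const uint8_t* in, int width,
                                       uint8_t* out) {
  int i;
  out[0] = in[0];    // leftmost sample keeps its value (no left neighbour)
  for (i = 1; i < width; ++i) {
    out[i] = (uint8_t)(in[i] - in[i - 1]);
  }
}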

@@ -59,9 +59,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
}

#if !WEBP_NEON_OMIT_C_CODE
static void CollectHistogram_C(const uint8_t* WEBP_RESTRICT ref,
                               const uint8_t* WEBP_RESTRICT pred,
                               int start_block, int end_block,
                               VP8Histogram* WEBP_RESTRICT const histo) {
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
@@ -109,8 +110,9 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
#define STORE(x, y, v) \
  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))

static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
                                      const int16_t* WEBP_RESTRICT in,
                                      uint8_t* WEBP_RESTRICT dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;

@@ -146,7 +148,9 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  }
}

static void ITransform_C(const uint8_t* WEBP_RESTRICT ref,
                         const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst,
                         int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {

@@ -154,7 +158,9 @@ static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
  }
}

static void FTransform_C(const uint8_t* WEBP_RESTRICT src,
                         const uint8_t* WEBP_RESTRICT ref,
                         int16_t* WEBP_RESTRICT out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {

@@ -184,14 +190,16 @@ static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
}
#endif  // !WEBP_NEON_OMIT_C_CODE

static void FTransform2_C(const uint8_t* WEBP_RESTRICT src,
                          const uint8_t* WEBP_RESTRICT ref,
                          int16_t* WEBP_RESTRICT out) {
  VP8FTransform(src, ref, out);
  VP8FTransform(src + 4, ref + 4, out + 16);
}

#if !WEBP_NEON_OMIT_C_CODE
static void FTransformWHT_C(const int16_t* WEBP_RESTRICT in,
                            int16_t* WEBP_RESTRICT out) {
  // input is 12b signed
  int32_t tmp[16];
  int i;
@@ -234,8 +242,9 @@ static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  }
}

static WEBP_INLINE void VerticalPred(uint8_t* WEBP_RESTRICT dst,
                                     const uint8_t* WEBP_RESTRICT top,
                                     int size) {
  int j;
  if (top != NULL) {
    for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);

@@ -244,8 +253,9 @@ static WEBP_INLINE void VerticalPred(uint8_t* dst,
  }
}

static WEBP_INLINE void HorizontalPred(uint8_t* WEBP_RESTRICT dst,
                                       const uint8_t* WEBP_RESTRICT left,
                                       int size) {
  if (left != NULL) {
    int j;
    for (j = 0; j < size; ++j) {

@@ -256,8 +266,9 @@ static WEBP_INLINE void HorizontalPred(uint8_t* dst,
  }
}

static WEBP_INLINE void TrueMotion(uint8_t* WEBP_RESTRICT dst,
                                   const uint8_t* WEBP_RESTRICT left,
                                   const uint8_t* WEBP_RESTRICT top, int size) {
  int y;
  if (left != NULL) {
    if (top != NULL) {

@@ -286,8 +297,9 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
  }
}

static WEBP_INLINE void DCMode(uint8_t* WEBP_RESTRICT dst,
                               const uint8_t* WEBP_RESTRICT left,
                               const uint8_t* WEBP_RESTRICT top,
                               int size, int round, int shift) {
  int DC = 0;
  int j;

@@ -312,8 +324,9 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)

static void IntraChromaPreds_C(uint8_t* WEBP_RESTRICT dst,
                               const uint8_t* WEBP_RESTRICT left,
                               const uint8_t* WEBP_RESTRICT top) {
  // U block
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
@@ -332,22 +345,28 @@ static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)

#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
static void Intra16Preds_C(uint8_t* WEBP_RESTRICT dst,
                           const uint8_t* WEBP_RESTRICT left,
                           const uint8_t* WEBP_RESTRICT top) {
  DCMode(I16DC16 + dst, left, top, 16, 16, 5);
  VerticalPred(I16VE16 + dst, top, 16);
  HorizontalPred(I16HE16 + dst, left, 16);
  TrueMotion(I16TM16 + dst, left, top, 16);
}
#endif  // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64

//------------------------------------------------------------------------------
// luma 4x4 prediction

#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
#define AVG2(a, b) (((a) + (b) + 1) >> 1)

// vertical
static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const uint8_t vals[4] = {
    AVG3(top[-1], top[0], top[1]),
    AVG3(top[ 0], top[1], top[2]),

@@ -360,7 +379,8 @@ static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
  }
}

// horizontal
static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];

@@ -372,14 +392,14 @@ static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}

static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
  Fill(dst, dc >> 3, 4);
}

static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];

@@ -398,7 +418,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) {
  DST(3, 0) = AVG3(D, C, B);
}

static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];

@@ -416,7 +436,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) {
  DST(3, 3) = AVG3(G, H, H);
}

static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];

@@ -438,7 +458,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) {
  DST(3, 1) = AVG3(B, C, D);
}

static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];

@@ -460,7 +480,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) {
  DST(3, 3) = AVG3(F, G, H);
}

static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];

@@ -475,7 +495,7 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}

static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];

@@ -498,7 +518,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) {
  DST(1, 3) = AVG3(L, K, J);
}

static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int x, y;
  const uint8_t* const clip = clip1 + 255 - top[-1];
  for (y = 0; y < 4; ++y) {

@@ -516,7 +536,8 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
static void Intra4Preds_C(uint8_t* WEBP_RESTRICT dst,
                          const uint8_t* WEBP_RESTRICT top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);

@@ -529,11 +550,14 @@ static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
  HU4(I4HU4 + dst, top);
}
#endif  // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
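A layout note for the packed 'top' buffer the 4x4 predictors read from,
derived from the comment above and the index arithmetic in DC4/HE4/HU4
(not code from the library):

//   index:  -5 -4 -3 -2 | -1 |  0  1  2  3 |  4  5  6  7
//   sample:  L  K  J  I |  X |  A  B  C  D |  E  F  G  H
//   (left column, stored bottom-up)  (top-left)  (above)  (above-right)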
//------------------------------------------------------------------------------
// Metric

#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int GetSSE(const uint8_t* WEBP_RESTRICT a,
                              const uint8_t* WEBP_RESTRICT b,
                              int w, int h) {
  int count = 0;
  int y, x;

@@ -548,21 +572,25 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
  return count;
}

static int SSE16x16_C(const uint8_t* WEBP_RESTRICT a,
                      const uint8_t* WEBP_RESTRICT b) {
  return GetSSE(a, b, 16, 16);
}
static int SSE16x8_C(const uint8_t* WEBP_RESTRICT a,
                     const uint8_t* WEBP_RESTRICT b) {
  return GetSSE(a, b, 16, 8);
}
static int SSE8x8_C(const uint8_t* WEBP_RESTRICT a,
                    const uint8_t* WEBP_RESTRICT b) {
  return GetSSE(a, b, 8, 8);
}
static int SSE4x4_C(const uint8_t* WEBP_RESTRICT a,
                    const uint8_t* WEBP_RESTRICT b) {
  return GetSSE(a, b, 4, 4);
}
#endif  // !WEBP_NEON_OMIT_C_CODE

static void Mean16x4_C(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) {
  int k, x, y;
  for (k = 0; k < 4; ++k) {
    uint32_t avg = 0;

@@ -586,7 +614,8 @@ static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform(const uint8_t* WEBP_RESTRICT in,
                      const uint16_t* WEBP_RESTRICT w) {
  int sum = 0;
  int tmp[16];
  int i;

@@ -620,15 +649,17 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
  return sum;
}
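A sketch of the elided accumulation, matching the comment above (the
actual library loop fuses this into its vertical pass):

  for (i = 0; i < 16; ++i) sum += (int)w[i] * abs(tmp[i]);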
static int Disto4x4_C(const uint8_t* WEBP_RESTRICT const a,
                      const uint8_t* WEBP_RESTRICT const b,
                      const uint16_t* WEBP_RESTRICT const w) {
  const int sum1 = TTransform(a, w);
  const int sum2 = TTransform(b, w);
  return abs(sum2 - sum1) >> 5;
}

static int Disto16x16_C(const uint8_t* WEBP_RESTRICT const a,
                        const uint8_t* WEBP_RESTRICT const b,
                        const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {

@@ -644,13 +675,14 @@ static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
// Quantization
//

#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

// Simple quantization
static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
                           const VP8Matrix* WEBP_RESTRICT const mtx) {
  int last = -1;
  int n;
  for (n = 0; n < 16; ++n) {

@@ -675,9 +707,8 @@ static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
  return (last >= 0);
}
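For orientation, the elided per-coefficient step looks roughly like this
(a sketch using the VP8Matrix bias/iq/zthresh fields and the QFIX
fixed-point convention; the real code also writes the dequantized value
back into in[], omitted here):

static int QuantizeBlock_Sketch(int16_t in[16], int16_t out[16],
                                const VP8Matrix* const mtx) {
  int last = -1;
  int n;
  for (n = 0; n < 16; ++n) {
    const int j = kZigzag[n];          // process in zigzag scan order
    int coeff = in[j];
    const int sign = (coeff < 0);
    if (sign) coeff = -coeff;
    if (coeff > (int)mtx->zthresh_[j]) {
      int level =
          (int)(((uint32_t)coeff * mtx->iq_[j] + mtx->bias_[j]) >> QFIX);
      if (level > MAX_LEVEL) level = MAX_LEVEL;   // clamp to max coeff level
      out[n] = sign ? -level : level;
      if (level) last = n;
    } else {
      out[n] = 0;
    }
  }
  return (last >= 0);   // 1 if any non-zero coefficient survived
}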
static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
                             const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;

@@ -688,7 +719,8 @@ static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
//------------------------------------------------------------------------------
// Block copy

static WEBP_INLINE void Copy(const uint8_t* WEBP_RESTRICT src,
                             uint8_t* WEBP_RESTRICT dst, int w, int h) {
  int y;
  for (y = 0; y < h; ++y) {
    memcpy(dst, src, w);

@@ -697,11 +729,13 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
  }
}

static void Copy4x4_C(const uint8_t* WEBP_RESTRICT src,
                      uint8_t* WEBP_RESTRICT dst) {
  Copy(src, dst, 4, 4);
}

static void Copy16x8_C(const uint8_t* WEBP_RESTRICT src,
                       uint8_t* WEBP_RESTRICT dst) {
  Copy(src, dst, 16, 8);
}
@@ -760,14 +794,19 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
  VP8EncQuantizeBlock = QuantizeBlock_C;
  VP8EncQuantize2Blocks = Quantize2Blocks_C;
  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
  VP8EncPredLuma4 = Intra4Preds_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
  VP8EncPredLuma16 = Intra16Preds_C;
#endif
  VP8FTransform2 = FTransform2_C;
  VP8EncPredChroma8 = IntraChromaPreds_C;
  VP8Mean16x4 = Mean16x4_C;
  VP8Copy4x4 = Copy4x4_C;
  VP8Copy16x8 = Copy16x8_C;
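This init function shows the dsp dispatch pattern used throughout
libwebp: every VP8Xxx symbol is a function pointer, first aimed at a
portable C fallback (under #if guards when a NEON build compiles the C
version out), and then optionally overwritten by a per-architecture
init. A hedged sketch of how the tail of such an init function
typically continues:

#if defined(WEBP_HAVE_SSE2)
  if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSSE2)) {
    VP8EncDspInitSSE2();   // swaps selected pointers for SSE2 versions
  }
#endif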

@@ -109,9 +109,9 @@ static const int kC2 = WEBP_TRANSFORM_AC3_C2;
  "sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"

// Does one or two inverse transforms.
static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* WEBP_RESTRICT ref,
                                             const int16_t* WEBP_RESTRICT in,
                                             uint8_t* WEBP_RESTRICT dst) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
  int temp14, temp15, temp16, temp17, temp18, temp19, temp20;

@@ -141,8 +141,9 @@ static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
  );
}

static void ITransform_MIPS32(const uint8_t* WEBP_RESTRICT ref,
                              const int16_t* WEBP_RESTRICT in,
                              uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne_MIPS32(ref, in, dst);
  if (do_two) {
    ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);

@@ -236,7 +237,7 @@ static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
}

static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
                                  const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;

@@ -358,8 +359,9 @@ static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
  "msub %[temp6], %[temp0] \n\t" \
  "msub %[temp7], %[temp1] \n\t"

static int Disto4x4_MIPS32(const uint8_t* WEBP_RESTRICT const a,
                           const uint8_t* WEBP_RESTRICT const b,
                           const uint16_t* WEBP_RESTRICT const w) {
  int tmp[32];
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;

@@ -393,8 +395,9 @@ static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS

static int Disto16x16_MIPS32(const uint8_t* WEBP_RESTRICT const a,
                             const uint8_t* WEBP_RESTRICT const b,
                             const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {

@@ -475,8 +478,9 @@ static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
  "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
  "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"

static void FTransform_MIPS32(const uint8_t* WEBP_RESTRICT src,
                              const uint8_t* WEBP_RESTRICT ref,
                              int16_t* WEBP_RESTRICT out) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
  int temp17, temp18, temp19, temp20;

@@ -537,7 +541,8 @@ static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
  GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
  GET_SSE_INNER(D, D + 1, D + 2, D + 3)

static int SSE16x16_MIPS32(const uint8_t* WEBP_RESTRICT a,
                           const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@@ -571,7 +576,8 @@ static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
  return count;
}

static int SSE16x8_MIPS32(const uint8_t* WEBP_RESTRICT a,
                          const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@@ -597,7 +603,8 @@ static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
  return count;
}

static int SSE8x8_MIPS32(const uint8_t* WEBP_RESTRICT a,
                         const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@@ -619,7 +626,8 @@ static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
  return count;
}

static int SSE4x4_MIPS32(const uint8_t* WEBP_RESTRICT a,
                         const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;

@@ -141,8 +141,9 @@ static const int kC2 = WEBP_TRANSFORM_AC3_C2;
  "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
  "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"

static void FTransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT src,
                                 const uint8_t* WEBP_RESTRICT ref,
                                 int16_t* WEBP_RESTRICT out) {
  const int c2217 = 2217;
  const int c5352 = 5352;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;

@@ -171,8 +172,9 @@ static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS

static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
                                      const int16_t* WEBP_RESTRICT in,
                                      uint8_t* WEBP_RESTRICT dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

@@ -239,16 +241,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  );
}

static void ITransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT ref,
                                 const int16_t* WEBP_RESTRICT in,
                                 uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
}

static int Disto4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
                              const uint8_t* WEBP_RESTRICT const b,
                              const uint16_t* WEBP_RESTRICT const w) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;

@@ -314,9 +318,9 @@ static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
  return abs(temp3 - temp17) >> 5;
}

static int Disto16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
                                const uint8_t* WEBP_RESTRICT const b,
                                const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {

@@ -367,8 +371,8 @@ static int Disto16x16_MIPSdspR2(const uint8_t* const a,
} while (0)

#define VERTICAL_PRED(DST, TOP, SIZE) \
static WEBP_INLINE void VerticalPred##SIZE( \
    uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (TOP)) { \
  int j; \
  if ((TOP)) { \
    for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE)); \

@@ -383,8 +387,8 @@ VERTICAL_PRED(dst, top, 16)

#undef VERTICAL_PRED
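For readers unfamiliar with the generator macros: each
VERTICAL_PRED(...) instantiation stamps out a size-specialized
predictor. A sketch of what the SIZE == 16 expansion amounts to; the
NULL fallback mirrors the plain-C VerticalPred and is assumed here,
since the diff cuts the macro's else branch:

static WEBP_INLINE void VerticalPred16_Sketch(
    uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int j;
  if (top) {
    for (j = 0; j < 16; ++j) memcpy(dst + j * BPS, top, 16);
  } else {
    Fill(dst, 127, 16);   // assumed: fill with the VP8 default DC value
  }
}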
#define HORIZONTAL_PRED(DST, LEFT, SIZE) \
static WEBP_INLINE void HorizontalPred##SIZE( \
    uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (LEFT)) { \
  if (LEFT) { \
    int j; \
    for (j = 0; j < (SIZE); ++j) { \

@@ -451,8 +455,9 @@ HORIZONTAL_PRED(dst, left, 16)
} while (0)

#define TRUE_MOTION(DST, LEFT, TOP, SIZE) \
static WEBP_INLINE void TrueMotion##SIZE(uint8_t* WEBP_RESTRICT (DST), \
                                         const uint8_t* WEBP_RESTRICT (LEFT), \
                                         const uint8_t* WEBP_RESTRICT (TOP)) { \
  if ((LEFT) != NULL) { \
    if ((TOP) != NULL) { \
      CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE)); \

@@ -480,8 +485,9 @@ TRUE_MOTION(dst, left, top, 16)
#undef CLIP_8B_TO_DST
#undef CLIPPING

static WEBP_INLINE void DCMode16(uint8_t* WEBP_RESTRICT dst,
                                 const uint8_t* WEBP_RESTRICT left,
                                 const uint8_t* WEBP_RESTRICT top) {
  int DC, DC1;
  int temp0, temp1, temp2, temp3;

@@ -543,8 +549,9 @@ static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
  FILL_8_OR_16(dst, DC, 16);
}

static WEBP_INLINE void DCMode8(uint8_t* WEBP_RESTRICT dst,
                                const uint8_t* WEBP_RESTRICT left,
                                const uint8_t* WEBP_RESTRICT top) {
  int DC, DC1;
  int temp0, temp1, temp2, temp3;

@@ -588,7 +595,7 @@ static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
  FILL_8_OR_16(dst, DC, 8);
}

static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1;
  __asm__ volatile(
    "ulw %[temp0], 0(%[top]) \n\t"

@@ -609,7 +616,7 @@ static void DC4(uint8_t* dst, const uint8_t* top) {
  );
}

static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
  const int c35 = 0xff00ff;
  __asm__ volatile (

@@ -664,7 +671,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
  );
}

static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  __asm__ volatile(
    "ulw %[temp0], -1(%[top]) \n\t"

@@ -695,7 +702,7 @@ static void VE4(uint8_t* dst, const uint8_t* top) {
  );
}

static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  __asm__ volatile(
    "ulw %[temp0], -4(%[top]) \n\t"

@@ -731,7 +738,7 @@ static void HE4(uint8_t* dst, const uint8_t* top) {
  );
}

static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int temp6, temp7, temp8, temp9, temp10, temp11;
  __asm__ volatile(

@@ -780,7 +787,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) {
  );
}

static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (

@@ -830,7 +837,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) {
  );
}

static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int temp6, temp7, temp8, temp9, temp10, temp11;
  __asm__ volatile(

@@ -877,7 +884,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) {
  );
}

static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (

@@ -926,7 +933,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) {
  );
}

static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (

@@ -974,7 +981,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) {
  );
}

static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  __asm__ volatile (
    "ulw %[temp0], -5(%[top]) \n\t"

@@ -1013,8 +1020,9 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)

static void IntraChromaPreds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
                                       const uint8_t* WEBP_RESTRICT left,
                                       const uint8_t* WEBP_RESTRICT top) {
  // U block
  DCMode8(C8DC8 + dst, left, top);
  VerticalPred8(C8VE8 + dst, top);

@@ -1033,8 +1041,9 @@ static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)

static void Intra16Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
                                   const uint8_t* WEBP_RESTRICT left,
                                   const uint8_t* WEBP_RESTRICT top) {
  DCMode16(I16DC16 + dst, left, top);
  VerticalPred16(I16VE16 + dst, top);
  HorizontalPred16(I16HE16 + dst, left);

@@ -1043,7 +1052,8 @@ static void Intra16Preds_MIPSdspR2(uint8_t* dst,
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
static void Intra4Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
                                  const uint8_t* WEBP_RESTRICT top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);

@@ -1079,7 +1089,8 @@ static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
  GET_SSE_INNER(C) \
  GET_SSE_INNER(D)

static int SSE16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                              const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (

@@ -1109,7 +1120,8 @@ static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  return count;
}

static int SSE16x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                             const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (

@@ -1131,7 +1143,8 @@ static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  return count;
}

static int SSE8x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                            const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (

@@ -1149,7 +1162,8 @@ static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  return count;
}

static int SSE4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                            const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (

@@ -1273,7 +1287,7 @@ static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  "3: \n\t"

static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
                                   const VP8Matrix* WEBP_RESTRICT const mtx) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  int sign, coeff, level;
  int max_level = MAX_LEVEL;

@@ -1314,7 +1328,7 @@ static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
}

static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
                                     const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;

@@ -1360,7 +1374,8 @@ static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
  "usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \
  "usw %[" #TEMP6 "], " #D "(%[out]) \n\t"

static void FTransformWHT_MIPSdspR2(const int16_t* WEBP_RESTRICT in,
                                    int16_t* WEBP_RESTRICT out) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;

@ -41,8 +41,9 @@
BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
} while (0) } while (0)
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
uint8_t* dst) { const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
v8i16 input0, input1; v8i16 input0, input1;
v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
v4i32 res0, res1, res2, res3; v4i32 res0, res1, res2, res3;
@ -69,16 +70,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
} }
static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst, static void ITransform_MSA(const uint8_t* WEBP_RESTRICT ref,
int do_two) { const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
ITransformOne(ref, in, dst); ITransformOne(ref, in, dst);
if (do_two) { if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4); ITransformOne(ref + 4, in + 16, dst + 4);
} }
} }
static void FTransform_MSA(const uint8_t* src, const uint8_t* ref, static void FTransform_MSA(const uint8_t* WEBP_RESTRICT src,
int16_t* out) { const uint8_t* WEBP_RESTRICT ref,
int16_t* WEBP_RESTRICT out) {
uint64_t out0, out1, out2, out3; uint64_t out0, out1, out2, out3;
uint32_t in0, in1, in2, in3; uint32_t in0, in1, in2, in3;
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
@ -131,7 +134,8 @@ static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
SD4(out0, out1, out2, out3, out, 8); SD4(out0, out1, out2, out3, out, 8);
} }
static void FTransformWHT_MSA(const int16_t* in, int16_t* out) { static void FTransformWHT_MSA(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out) {
v8i16 in0 = { 0 }; v8i16 in0 = { 0 };
v8i16 in1 = { 0 }; v8i16 in1 = { 0 };
v8i16 tmp0, tmp1, tmp2, tmp3; v8i16 tmp0, tmp1, tmp2, tmp3;
@ -168,7 +172,8 @@ static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
ST_SH2(out0, out1, out, 8); ST_SH2(out0, out1, out, 8);
} }
static int TTransform_MSA(const uint8_t* in, const uint16_t* w) { static int TTransform_MSA(const uint8_t* WEBP_RESTRICT in,
const uint16_t* WEBP_RESTRICT w) {
int sum; int sum;
uint32_t in0_m, in1_m, in2_m, in3_m; uint32_t in0_m, in1_m, in2_m, in3_m;
v16i8 src0 = { 0 }; v16i8 src0 = { 0 };
@ -200,15 +205,17 @@ static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
return sum; return sum;
} }
static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b, static int Disto4x4_MSA(const uint8_t* WEBP_RESTRICT const a,
const uint16_t* const w) { const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
const int sum1 = TTransform_MSA(a, w); const int sum1 = TTransform_MSA(a, w);
const int sum2 = TTransform_MSA(b, w); const int sum2 = TTransform_MSA(b, w);
return abs(sum2 - sum1) >> 5; return abs(sum2 - sum1) >> 5;
} }
static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b, static int Disto16x16_MSA(const uint8_t* WEBP_RESTRICT const a,
const uint16_t* const w) { const uint8_t* WEBP_RESTRICT const b,
const uint16_t* WEBP_RESTRICT const w) {
int D = 0; int D = 0;
int x, y; int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -259,7 +266,9 @@ static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define AVG2(a, b) (((a) + (b) + 1) >> 1) #define AVG2(a, b) (((a) + (b) + 1) >> 1)
-static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {  // vertical
+// vertical
+static WEBP_INLINE void VE4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const v16u8 A1 = { 0 };
   const uint64_t val_m = LD(top - 1);
   const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);

@@ -272,7 +281,9 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {  // vertical
   SW4(out, out, out, out, dst, BPS);
 }

-static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {  // horizontal
+// horizontal
+static WEBP_INLINE void HE4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int I = top[-2];
   const int J = top[-3];

@@ -284,7 +295,8 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {  // horizontal
   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }

-static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   uint32_t dc = 4;
   int i;
   for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];

@@ -293,7 +305,8 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
   SW4(dc, dc, dc, dc, dst, BPS);
 }

-static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void RD4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const v16u8 A2 = { 0 };
   const uint64_t val_m = LD(top - 5);
   const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);

@@ -313,7 +326,8 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
   SW4(val3, val2, val1, val0, dst, BPS);
 }

-static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void LD4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const v16u8 A1 = { 0 };
   const uint64_t val_m = LD(top);
   const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);

@@ -333,7 +347,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
   SW4(val0, val1, val2, val3, dst, BPS);
 }

-static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VR4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int I = top[-2];
   const int J = top[-3];

@@ -354,7 +369,8 @@ static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
   DST(3, 1) = AVG3(B, C, D);
 }

-static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VL4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const int A = top[0];
   const int B = top[1];
   const int C = top[2];

@@ -375,7 +391,8 @@ static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
   DST(3, 3) = AVG3(F, G, H);
 }

-static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HU4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const int I = top[-2];
   const int J = top[-3];
   const int K = top[-4];

@@ -390,7 +407,8 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
   DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }

-static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HD4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int I = top[-2];
   const int J = top[-3];

@@ -411,7 +429,8 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
   DST(1, 3) = AVG3(L, K, J);
 }

-static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void TM4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const v16i8 zero = { 0 };
   const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
   const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);

@@ -431,7 +450,8 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
 #undef AVG3
 #undef AVG2

-static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MSA(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   DC4(I4DC4 + dst, top);
   TM4(I4TM4 + dst, top);
   VE4(I4VE4 + dst, top);

@@ -451,7 +471,8 @@ static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
     ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);  \
   } while (0)

-static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VerticalPred16x16(uint8_t* WEBP_RESTRICT dst,
+                                          const uint8_t* WEBP_RESTRICT top) {
   if (top != NULL) {
     const v16u8 out = LD_UB(top);
     STORE16x16(out, dst);

@@ -461,8 +482,8 @@ static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
   }
 }

-static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
-                                            const uint8_t* left) {
+static WEBP_INLINE void HorizontalPred16x16(uint8_t* WEBP_RESTRICT dst,
+                                            const uint8_t* WEBP_RESTRICT left) {
   if (left != NULL) {
     int j;
     for (j = 0; j < 16; j += 4) {

@@ -480,8 +501,9 @@ static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
   }
 }

-static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
-                                        const uint8_t* top) {
+static WEBP_INLINE void TrueMotion16x16(uint8_t* WEBP_RESTRICT dst,
+                                        const uint8_t* WEBP_RESTRICT left,
+                                        const uint8_t* WEBP_RESTRICT top) {
   if (left != NULL) {
     if (top != NULL) {
       int j;

@@ -519,8 +541,9 @@ static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
   }
 }

-static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
-                                    const uint8_t* top) {
+static WEBP_INLINE void DCMode16x16(uint8_t* WEBP_RESTRICT dst,
+                                    const uint8_t* WEBP_RESTRICT left,
+                                    const uint8_t* WEBP_RESTRICT top) {
   int DC;
   v16u8 out;
   if (top != NULL && left != NULL) {

@@ -548,8 +571,9 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
   STORE16x16(out, dst);
 }

-static void Intra16Preds_MSA(uint8_t* dst,
-                             const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_MSA(uint8_t* WEBP_RESTRICT dst,
+                             const uint8_t* WEBP_RESTRICT left,
+                             const uint8_t* WEBP_RESTRICT top) {
   DCMode16x16(I16DC16 + dst, left, top);
   VerticalPred16x16(I16VE16 + dst, top);
   HorizontalPred16x16(I16HE16 + dst, left);

@@ -574,7 +598,8 @@ static void Intra16Preds_MSA(uint8_t* dst,
     SD4(out, out, out, out, dst + 4 * BPS, BPS);  \
   } while (0)

-static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VerticalPred8x8(uint8_t* WEBP_RESTRICT dst,
+                                        const uint8_t* WEBP_RESTRICT top) {
   if (top != NULL) {
     const uint64_t out = LD(top);
     STORE8x8(out, dst);

@@ -584,7 +609,8 @@ static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
   }
 }

-static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HorizontalPred8x8(uint8_t* WEBP_RESTRICT dst,
+                                          const uint8_t* WEBP_RESTRICT left) {
   if (left != NULL) {
     int j;
     for (j = 0; j < 8; j += 4) {

@@ -606,8 +632,9 @@ static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
   }
 }

-static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
-                                      const uint8_t* top) {
+static WEBP_INLINE void TrueMotion8x8(uint8_t* WEBP_RESTRICT dst,
+                                      const uint8_t* WEBP_RESTRICT left,
+                                      const uint8_t* WEBP_RESTRICT top) {
   if (left != NULL) {
     if (top != NULL) {
       int j;

@@ -646,8 +673,9 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
   }
 }

-static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
-                                  const uint8_t* top) {
+static WEBP_INLINE void DCMode8x8(uint8_t* WEBP_RESTRICT dst,
+                                  const uint8_t* WEBP_RESTRICT left,
+                                  const uint8_t* WEBP_RESTRICT top) {
   uint64_t out;
   v16u8 src = { 0 };
   if (top != NULL && left != NULL) {

@@ -670,8 +698,9 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
   STORE8x8(out, dst);
 }

-static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
-                                 const uint8_t* top) {
+static void IntraChromaPreds_MSA(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT left,
+                                 const uint8_t* WEBP_RESTRICT top) {
   // U block
   DCMode8x8(C8DC8 + dst, left, top);
   VerticalPred8x8(C8VE8 + dst, top);

@@ -712,7 +741,8 @@ static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
     DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);  \
   } while (0)

-static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MSA(const uint8_t* WEBP_RESTRICT a,
+                        const uint8_t* WEBP_RESTRICT b) {
   uint32_t sum;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;

@@ -739,7 +769,8 @@ static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
   return sum;
 }

-static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MSA(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT b) {
   uint32_t sum;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;

@@ -758,7 +789,8 @@ static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
   return sum;
 }

-static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MSA(const uint8_t* WEBP_RESTRICT a,
+                      const uint8_t* WEBP_RESTRICT b) {
   uint32_t sum;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;

@@ -778,7 +810,8 @@ static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
   return sum;
 }

-static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MSA(const uint8_t* WEBP_RESTRICT a,
+                      const uint8_t* WEBP_RESTRICT b) {
   uint32_t sum = 0;
   uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
   v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;

@@ -801,7 +834,7 @@ static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
 // Quantization

 static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
-                             const VP8Matrix* const mtx) {
+                             const VP8Matrix* WEBP_RESTRICT const mtx) {
   int sum;
   v8i16 in0, in1, sh0, sh1, out0, out1;
   v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;

@@ -854,7 +887,7 @@ static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
 }

 static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
-                               const VP8Matrix* const mtx) {
+                               const VP8Matrix* WEBP_RESTRICT const mtx) {
   int nz;
   nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;

View File

@@ -60,8 +60,8 @@ static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
 static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                     const int16x8_t row23,
-                                    const uint8_t* const ref,
-                                    uint8_t* const dst) {
+                                    const uint8_t* WEBP_RESTRICT const ref,
+                                    uint8_t* WEBP_RESTRICT const dst) {
   uint32x2_t dst01 = vdup_n_u32(0);
   uint32x2_t dst23 = vdup_n_u32(0);

@@ -120,8 +120,9 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
   Transpose8x2_NEON(E0, E1, rows);
 }

-static void ITransformOne_NEON(const uint8_t* ref,
-                               const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
+                               const int16_t* WEBP_RESTRICT in,
+                               uint8_t* WEBP_RESTRICT dst) {
   int16x8x2_t rows;
   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
   TransformPass_NEON(&rows);

@@ -131,8 +132,9 @@ static void ITransformOne_NEON(const uint8_t* ref,

 #else

-static void ITransformOne_NEON(const uint8_t* ref,
-                               const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
+                               const int16_t* WEBP_RESTRICT in,
+                               uint8_t* WEBP_RESTRICT dst) {
   const int kBPS = BPS;
   const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

@@ -247,8 +249,9 @@ static void ITransformOne_NEON(const uint8_t* ref,

 #endif  // WEBP_USE_INTRINSICS

-static void ITransform_NEON(const uint8_t* ref,
-                            const int16_t* in, uint8_t* dst, int do_two) {
+static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref,
+                            const int16_t* WEBP_RESTRICT in,
+                            uint8_t* WEBP_RESTRICT dst, int do_two) {
   ITransformOne_NEON(ref, in, dst);
   if (do_two) {
     ITransformOne_NEON(ref + 4, in + 16, dst + 4);

@@ -294,8 +297,9 @@ static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
   return vreinterpretq_s16_u16(vsubl_u8(a, b));
 }

-static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
-                            int16_t* out) {
+static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
+                            const uint8_t* WEBP_RESTRICT ref,
+                            int16_t* WEBP_RESTRICT out) {
   int16x8_t d0d1, d3d2;  // working 4x4 int16 variables
   {
     const uint8x16_t S0 = Load4x4_NEON(src);

@@ -364,8 +368,9 @@ static const int32_t kCoeff32[] = {
   51000, 51000, 51000, 51000
 };

-static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
-                            int16_t* out) {
+static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
+                            const uint8_t* WEBP_RESTRICT ref,
+                            int16_t* WEBP_RESTRICT out) {
   const int kBPS = BPS;
   const uint8_t* src_ptr = src;
   const uint8_t* ref_ptr = ref;

@@ -484,7 +489,8 @@ static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
     src += stride;  \
   } while (0)

-static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
+static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src,
+                               int16_t* WEBP_RESTRICT out) {
   const int stride = 16;
   const int16x4_t zero = vdup_n_s16(0);
   int32x4x4_t tmp0;

@@ -659,8 +665,9 @@ static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
-                         const uint16_t* const w) {
+static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a,
+                         const uint8_t* WEBP_RESTRICT const b,
+                         const uint16_t* WEBP_RESTRICT const w) {
   uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
   uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
   uint32x2_t d_in_ab_89ab = vdup_n_u32(0);

@@ -701,8 +708,9 @@ static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
 }
 #undef LOAD_LANE_32b
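
For readers tracking what the vector registers above are computing: the plain-C reference that this NEON kernel mirrors (condensed here from libwebp's generic dsp code; treat the exact spelling as a sketch) transforms both blocks with a 4x4 Walsh-Hadamard pass, weights the absolute coefficients by w[], and returns the scaled difference:

static int TTransform(const uint8_t* in, const uint16_t* w) {
  int sum = 0;
  int tmp[16];
  int i;
  // horizontal pass
  for (i = 0; i < 4; ++i, in += BPS) {
    const int a0 = in[0] + in[2];
    const int a1 = in[1] + in[3];
    const int a2 = in[1] - in[3];
    const int a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // vertical pass, accumulating the weighted absolute values
  for (i = 0; i < 4; ++i, ++w) {
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    sum += w[0] * abs(a0 + a1);
    sum += w[4] * abs(a3 + a2);
    sum += w[8] * abs(a3 - a2);
    sum += w[12] * abs(a0 - a1);
  }
  return sum;
}

static int Disto4x4_Ref(const uint8_t* const a, const uint8_t* const b,
                        const uint16_t* const w) {
  return abs(TTransform(b, w) - TTransform(a, w)) >> 5;
}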
-static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
-                           const uint16_t* const w) {
+static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a,
+                           const uint8_t* WEBP_RESTRICT const b,
+                           const uint16_t* WEBP_RESTRICT const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {

@@ -715,9 +723,10 @@ static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,

 //------------------------------------------------------------------------------

-static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
+static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
+                                  const uint8_t* WEBP_RESTRICT pred,
                                   int start_block, int end_block,
-                                  VP8Histogram* const histo) {
+                                  VP8Histogram* WEBP_RESTRICT const histo) {
   const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };

@@ -747,9 +756,9 @@ static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,

 //------------------------------------------------------------------------------

-static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
-                                             const uint8_t* const b,
-                                             uint32x4_t* const sum) {
+static WEBP_INLINE void AccumulateSSE16_NEON(
+    const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b,
+    uint32x4_t* const sum) {
   const uint8x16_t a0 = vld1q_u8(a);
   const uint8x16_t b0 = vld1q_u8(b);
   const uint8x16_t abs_diff = vabdq_u8(a0, b0);

@@ -775,7 +784,8 @@ static int SumToInt_NEON(uint32x4_t sum) {
 #endif
 }

-static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a,
+                         const uint8_t* WEBP_RESTRICT b) {
   uint32x4_t sum = vdupq_n_u32(0);
   int y;
   for (y = 0; y < 16; ++y) {

@@ -784,7 +794,8 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
   return SumToInt_NEON(sum);
 }

-static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a,
+                        const uint8_t* WEBP_RESTRICT b) {
   uint32x4_t sum = vdupq_n_u32(0);
   int y;
   for (y = 0; y < 8; ++y) {

@@ -793,7 +804,8 @@ static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
   return SumToInt_NEON(sum);
 }

-static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT b) {
   uint32x4_t sum = vdupq_n_u32(0);
   int y;
   for (y = 0; y < 8; ++y) {

@@ -806,7 +818,8 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
   return SumToInt_NEON(sum);
 }

-static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT b) {
   const uint8x16_t a0 = Load4x4_NEON(a);
   const uint8x16_t b0 = Load4x4_NEON(b);
   const uint8x16_t abs_diff = vabdq_u8(a0, b0);

@@ -825,8 +838,9 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
 // Compilation with gcc-4.6.x is problematic for now.
 #if !defined(WORK_AROUND_GCC)

-static int16x8_t Quantize_NEON(int16_t* const in,
-                               const VP8Matrix* const mtx, int offset) {
+static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in,
+                               const VP8Matrix* WEBP_RESTRICT const mtx,
+                               int offset) {
   const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
   const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
   const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);

@@ -860,7 +874,7 @@ static const uint8_t kShuffles[4][8] = {
 };

 static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
-                              const VP8Matrix* const mtx) {
+                              const VP8Matrix* WEBP_RESTRICT const mtx) {
   const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
   const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
   uint8x8x4_t shuffles;

@@ -902,7 +916,7 @@ static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
 }

 static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
-                                const VP8Matrix* const mtx) {
+                                const VP8Matrix* WEBP_RESTRICT const mtx) {
   int nz;
   nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;

@@ -911,6 +925,271 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],

 #endif  // !WORK_AROUND_GCC
#if WEBP_AARCH64
#if BPS == 32
#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane) \
do { \
uint8x16_t r; \
r = vqtbl2q_u8(qcombined, tbl); \
r = vreinterpretq_u8_u32( \
vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane), \
vreinterpretq_u32_u8(r), 1)); \
vst1q_u8(dst, r); \
} while (0)
#define RD4_VR4_LD4_VL4_NEON(dst, tbl) \
do { \
uint8x16_t r; \
r = vqtbl2q_u8(qcombined, tbl); \
vst1q_u8(dst, r); \
} while (0)
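
The two macros above lean on the AArch64 table-lookup instruction vqtbl2q_u8: every AVG2/AVG3 value any of the 4x4 modes can need is precomputed once into a 32-byte table (qcombined), and each destination row is then just a byte shuffle through one of the index tables. A scalar model of that lookup, for illustration only (Tbl2Lookup is a hypothetical name):

static void Tbl2Lookup(const uint8_t table[32], const uint8_t idx[16],
                       uint8_t out[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    // vqtbl2q_u8 semantics: out-of-range indices select zero.
    out[i] = (idx[i] < 32) ? table[idx[i]] : 0;
  }
}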
static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT top) {
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13
// L K J I X A B C D E F G H
// -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7
static const uint8_t kLookupTbl1[64] = {
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12,
3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0,
4, 20, 21, 22, 3, 18, 2, 17, 3, 19, 4, 20, 2, 17, 1, 16,
2, 18, 3, 19, 1, 16, 31, 31, 1, 17, 2, 18, 31, 31, 31, 31
};
static const uint8_t kLookupTbl2[64] = {
20, 21, 22, 23, 5, 6, 7, 8, 22, 23, 24, 25, 6, 7, 8, 9,
19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25,
18, 19, 20, 21, 19, 5, 6, 7, 24, 25, 26, 27, 7, 8, 9, 26,
17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27
};
static const uint8_t kLookupTbl3[64] = {
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 19, 19, 19, 19,
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 18, 18, 18, 18,
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 17, 17, 17, 17,
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 16, 16, 16, 16
};
const uint8x16x4_t lookup_avgs1 = vld1q_u8_x4(kLookupTbl1);
const uint8x16x4_t lookup_avgs2 = vld1q_u8_x4(kLookupTbl2);
const uint8x16x4_t lookup_avgs3 = vld1q_u8_x4(kLookupTbl3);
const uint8x16_t preload = vld1q_u8(top - 5);
uint8x16x2_t qcombined;
uint8x16_t result0, result1;
uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]);
uint8x16_t b = preload;
uint8x16_t c = vextq_u8(a, a, 2);
uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b);
uint8x16_t avg2_all = vrhaddq_u8(a, b);
uint8x8_t preload_x8, sub_a, sub_c;
uint8_t result_u8;
uint8x8_t res_lo, res_hi;
uint8x16_t full_b;
uint16x8_t sub, sum_lo, sum_hi;
preload_x8 = vget_low_u8(c);
preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3);
result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3;
avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15);
avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14);
qcombined.val[0] = avg2_all;
qcombined.val[1] = avg3_all;
sub_a = vdup_laneq_u8(preload, 4);
// preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a}
full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]);
// preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d}
sub_c = vreinterpret_u8_u32(vdup_n_u32(
vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0)));
sub = vsubl_u8(sub_c, sub_a);
sum_lo = vaddw_u8(sub, vget_low_u8(full_b));
res_lo = vqmovun_s16(vreinterpretq_s16_u16(sum_lo));
sum_hi = vaddw_u8(sub, vget_high_u8(full_b));
res_hi = vqmovun_s16(vreinterpretq_s16_u16(sum_hi));
// DC4, VE4, HE4, TM4
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0);
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1);
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0);
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1);
// RD4, VR4, LD4, VL4
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]);
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]);
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]);
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]);
// HD4, HU4
result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]);
result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]);
vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0));
vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0));
vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1));
vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1));
}
#endif // BPS == 32
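
Given the sample layout documented at the top of Intra4Preds_NEON (left samples at top[-5..-2], the top-left at top[-1], the row above at top[0..3]), the DC value assembled by the vaddlv_u8 reduction above is the ordinary 8-sample average, the same one the MSA and SSE2 DC4 kernels earlier in this commit compute. In scalar form (DC4_Value is a hypothetical name):

static uint8_t DC4_Value(const uint8_t* top) {
  uint32_t dc = 4;  // rounding term for the final >> 3
  int i;
  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
  return (uint8_t)(dc >> 3);
}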
static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) {
uint8x16_t a = vdupq_n_u8(value);
int i;
for (i = 0; i < 16; i++) {
vst1q_u8(dst + BPS * i, a);
}
}
static WEBP_INLINE void Fill16_NEON(uint8_t* dst, const uint8_t* src) {
uint8x16_t a = vld1q_u8(src);
int i;
for (i = 0; i < 16; i++) {
vst1q_u8(dst + BPS * i, a);
}
}
static WEBP_INLINE void HorizontalPred16_NEON(uint8_t* dst,
const uint8_t* left) {
uint8x16_t a;
if (left == NULL) {
Fill_NEON(dst, 129);
return;
}
a = vld1q_u8(left + 0);
vst1q_u8(dst + BPS * 0, vdupq_laneq_u8(a, 0));
vst1q_u8(dst + BPS * 1, vdupq_laneq_u8(a, 1));
vst1q_u8(dst + BPS * 2, vdupq_laneq_u8(a, 2));
vst1q_u8(dst + BPS * 3, vdupq_laneq_u8(a, 3));
vst1q_u8(dst + BPS * 4, vdupq_laneq_u8(a, 4));
vst1q_u8(dst + BPS * 5, vdupq_laneq_u8(a, 5));
vst1q_u8(dst + BPS * 6, vdupq_laneq_u8(a, 6));
vst1q_u8(dst + BPS * 7, vdupq_laneq_u8(a, 7));
vst1q_u8(dst + BPS * 8, vdupq_laneq_u8(a, 8));
vst1q_u8(dst + BPS * 9, vdupq_laneq_u8(a, 9));
vst1q_u8(dst + BPS * 10, vdupq_laneq_u8(a, 10));
vst1q_u8(dst + BPS * 11, vdupq_laneq_u8(a, 11));
vst1q_u8(dst + BPS * 12, vdupq_laneq_u8(a, 12));
vst1q_u8(dst + BPS * 13, vdupq_laneq_u8(a, 13));
vst1q_u8(dst + BPS * 14, vdupq_laneq_u8(a, 14));
vst1q_u8(dst + BPS * 15, vdupq_laneq_u8(a, 15));
}
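
The sixteen lane-broadcast stores above are an unrolled form of the obvious scalar loop: row j of the 16x16 prediction is filled with the left-edge sample left[j], the left == NULL case having already fallen back to a constant 129 fill. A sketch (hypothetical name, assumes left != NULL):

static void HorizontalPred16_C(uint8_t* dst, const uint8_t* left) {
  int i, j;
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) dst[i] = left[j];
    dst += BPS;
  }
}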
static WEBP_INLINE void VerticalPred16_NEON(uint8_t* dst, const uint8_t* top) {
if (top != NULL) {
Fill16_NEON(dst, top);
} else {
Fill_NEON(dst, 127);
}
}
static WEBP_INLINE void DCMode_NEON(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
uint8_t s;
if (top != NULL) {
uint16_t dc;
dc = vaddlvq_u8(vld1q_u8(top));
if (left != NULL) {
// top and left present.
dc += vaddlvq_u8(vld1q_u8(left));
s = vqrshrnh_n_u16(dc, 5);
} else {
// top but no left.
s = vqrshrnh_n_u16(dc, 4);
}
} else {
if (left != NULL) {
uint16_t dc;
// left but no top.
dc = vaddlvq_u8(vld1q_u8(left));
s = vqrshrnh_n_u16(dc, 4);
} else {
// No top, no left, nothing.
s = 0x80;
}
}
Fill_NEON(dst, s);
}
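
In scalar terms, the branches above implement the usual DC fallback ladder; vaddlvq_u8 is the across-vector byte sum and vqrshrnh_n_u16 the rounding narrowing shift. A sketch of the same logic (DC16_Value is a hypothetical name):

static uint8_t DC16_Value(const uint8_t* left, const uint8_t* top) {
  uint32_t dc = 0;
  int i;
  if (top != NULL && left != NULL) {  // 32 samples, rounded shift by 5
    for (i = 0; i < 16; ++i) dc += top[i] + left[i];
    return (uint8_t)((dc + 16) >> 5);
  }
  if (top != NULL || left != NULL) {  // 16 samples, rounded shift by 4
    const uint8_t* const edge = (top != NULL) ? top : left;
    for (i = 0; i < 16; ++i) dc += edge[i];
    return (uint8_t)((dc + 8) >> 4);
  }
  return 0x80;  // no neighbours at all
}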
static WEBP_INLINE void TrueMotionHelper_NEON(uint8_t* dst,
const uint8x8_t outer,
const uint8x8x2_t inner,
const uint16x8_t a, int i,
const int n) {
uint8x8_t d1, d2;
uint16x8_t r1, r2;
r1 = vaddl_u8(outer, inner.val[0]);
r1 = vqsubq_u16(r1, a);
d1 = vqmovun_s16(vreinterpretq_s16_u16(r1));
r2 = vaddl_u8(outer, inner.val[1]);
r2 = vqsubq_u16(r2, a);
d2 = vqmovun_s16(vreinterpretq_s16_u16(r2));
vst1_u8(dst + BPS * (i * 4 + n), d1);
vst1_u8(dst + BPS * (i * 4 + n) + 8, d2);
}
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
int i;
uint16x8_t a;
uint8x8x2_t inner;
if (left == NULL) {
// True motion without left samples (hence: with default 129 value) is
// equivalent to VE prediction where you just copy the top samples.
// Note that if top samples are not available, the default value is then
// 129, and not 127 as in the VerticalPred case.
if (top != NULL) {
VerticalPred16_NEON(dst, top);
} else {
Fill_NEON(dst, 129);
}
return;
}
// left is not NULL.
if (top == NULL) {
HorizontalPred16_NEON(dst, left);
return;
}
// Neither left nor top are NULL.
a = vdupq_n_u16(left[-1]);
inner = vld1_u8_x2(top);
for (i = 0; i < 4; i++) {
const uint8x8x4_t outer = vld4_dup_u8(&left[i * 4]);
TrueMotionHelper_NEON(dst, outer.val[0], inner, a, i, 0);
TrueMotionHelper_NEON(dst, outer.val[1], inner, a, i, 1);
TrueMotionHelper_NEON(dst, outer.val[2], inner, a, i, 2);
TrueMotionHelper_NEON(dst, outer.val[3], inner, a, i, 3);
}
}
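
With both edges present, the helper invocations above evaluate the TrueMotion formula proper: each output sample is left[y] + top[x] - top_left, clamped to [0, 255]. Written out in scalar form (a sketch with hypothetical names, assuming left and top are non-NULL and left[-1] holds the top-left sample, as the vector code does):

static uint8_t Clip255(int v) {
  return (uint8_t)((v < 0) ? 0 : (v > 255) ? 255 : v);
}

static void TrueMotion16_C(uint8_t* dst, const uint8_t* left,
                           const uint8_t* top) {
  int x, y;
  for (y = 0; y < 16; ++y) {
    for (x = 0; x < 16; ++x) {
      dst[x] = Clip255(left[y] + top[x] - left[-1]);
    }
    dst += BPS;
  }
}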
static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst,
const uint8_t* WEBP_RESTRICT left,
const uint8_t* WEBP_RESTRICT top) {
DCMode_NEON(I16DC16 + dst, left, top);
VerticalPred16_NEON(I16VE16 + dst, top);
HorizontalPred16_NEON(I16HE16 + dst, left);
TrueMotion_NEON(I16TM16 + dst, left, top);
}
#endif // WEBP_AARCH64
 //------------------------------------------------------------------------------
 // Entry point

@@ -931,9 +1210,17 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
   VP8SSE8x8 = SSE8x8_NEON;
   VP8SSE4x4 = SSE4x4_NEON;
+#if WEBP_AARCH64
+#if BPS == 32
+  VP8EncPredLuma4 = Intra4Preds_NEON;
+#endif
+  VP8EncPredLuma16 = Intra16Preds_NEON;
+#endif
 #if !defined(WORK_AROUND_GCC)
   VP8EncQuantizeBlock = QuantizeBlock_NEON;
   VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_NEON;
 #endif
 }

View File

@@ -26,8 +26,9 @@
 // Transforms (Paragraph 14.4)

 // Does one inverse transform.
-static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in,
-                                uint8_t* dst) {
+static void ITransform_One_SSE2(const uint8_t* WEBP_RESTRICT ref,
+                                const int16_t* WEBP_RESTRICT in,
+                                uint8_t* WEBP_RESTRICT dst) {
   // This implementation makes use of 16-bit fixed point versions of two
   // multiply constants:
   // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
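
A quick standalone check of the constant quoted in that comment (illustrative only, not libwebp code): sqrt(2) * cos(pi/8) in 16.16 fixed point rounds to 85627. Note that M_PI is POSIX rather than ISO C, so some toolchains need a feature-test macro:

#include <math.h>
#include <stdio.h>

int main(void) {
  const double k1 = sqrt(2.0) * cos(M_PI / 8.0);
  printf("%d\n", (int)(k1 * 65536.0 + 0.5));  // prints 85627
  return 0;
}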
@@ -177,8 +178,9 @@ static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in,
 }

 // Does two inverse transforms.
-static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in,
-                                uint8_t* dst) {
+static void ITransform_Two_SSE2(const uint8_t* WEBP_RESTRICT ref,
+                                const int16_t* WEBP_RESTRICT in,
+                                uint8_t* WEBP_RESTRICT dst) {
   // This implementation makes use of 16-bit fixed point versions of two
   // multiply constants:
   // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16

@@ -316,7 +318,9 @@ static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in,
 }

 // Does one or two inverse transforms.
-static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+static void ITransform_SSE2(const uint8_t* WEBP_RESTRICT ref,
+                            const int16_t* WEBP_RESTRICT in,
+                            uint8_t* WEBP_RESTRICT dst,
                             int do_two) {
   if (do_two) {
     ITransform_Two_SSE2(ref, in, dst);

@@ -373,7 +377,7 @@ static void FTransformPass1_SSE2(const __m128i* const in01,

 static void FTransformPass2_SSE2(const __m128i* const v01,
                                  const __m128i* const v32,
-                                 int16_t* out) {
+                                 int16_t* WEBP_RESTRICT out) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i seven = _mm_set1_epi16(7);
   const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,

@@ -424,8 +428,9 @@ static void FTransformPass2_SSE2(const __m128i* const v01,
   _mm_storeu_si128((__m128i*)&out[8], d2_f3);
 }

-static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
-                            int16_t* out) {
+static void FTransform_SSE2(const uint8_t* WEBP_RESTRICT src,
+                            const uint8_t* WEBP_RESTRICT ref,
+                            int16_t* WEBP_RESTRICT out) {
   const __m128i zero = _mm_setzero_si128();
   // Load src.
   const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);

@@ -468,8 +473,9 @@ static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
   FTransformPass2_SSE2(&v01, &v32, out);
 }

-static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
-                             int16_t* out) {
+static void FTransform2_SSE2(const uint8_t* WEBP_RESTRICT src,
+                             const uint8_t* WEBP_RESTRICT ref,
+                             int16_t* WEBP_RESTRICT out) {
   const __m128i zero = _mm_setzero_si128();

   // Load src and convert to 16b.

@@ -517,7 +523,8 @@ static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
   FTransformPass2_SSE2(&v01h, &v32h, out + 16);
 }

-static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
+static void FTransformWHTRow_SSE2(const int16_t* WEBP_RESTRICT const in,
+                                  __m128i* const out) {
   const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
   const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
   const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);

@@ -533,7 +540,8 @@ static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
   *out = _mm_madd_epi16(D, kMult);
 }

-static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
+static void FTransformWHT_SSE2(const int16_t* WEBP_RESTRICT in,
+                               int16_t* WEBP_RESTRICT out) {
   // Input is 12b signed.
   __m128i row0, row1, row2, row3;
   // Rows are 14b signed.

@@ -566,9 +574,10 @@ static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.

-static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
+static void CollectHistogram_SSE2(const uint8_t* WEBP_RESTRICT ref,
+                                  const uint8_t* WEBP_RESTRICT pred,
                                   int start_block, int end_block,
-                                  VP8Histogram* const histo) {
+                                  VP8Histogram* WEBP_RESTRICT const histo) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
   int j;

@@ -640,7 +649,8 @@ static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
   }
 }

-static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE8uv_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                   const uint8_t* WEBP_RESTRICT top) {
   int j;
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   for (j = 0; j < 8; ++j) {

@@ -648,7 +658,8 @@ static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
   }
 }

-static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE16_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                  const uint8_t* WEBP_RESTRICT top) {
   const __m128i top_values = _mm_load_si128((const __m128i*)top);
   int j;
   for (j = 0; j < 16; ++j) {

@@ -656,8 +667,9 @@ static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
   }
 }

-static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
-                                          const uint8_t* top, int size) {
+static WEBP_INLINE void VerticalPred_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                          const uint8_t* WEBP_RESTRICT top,
+                                          int size) {
   if (top != NULL) {
     if (size == 8) {
       VE8uv_SSE2(dst, top);

@@ -669,7 +681,8 @@ static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
   }
 }

-static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE8uv_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                   const uint8_t* WEBP_RESTRICT left) {
   int j;
   for (j = 0; j < 8; ++j) {
     const __m128i values = _mm_set1_epi8((char)left[j]);

@@ -678,7 +691,8 @@ static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
   }
 }

-static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE16_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                  const uint8_t* WEBP_RESTRICT left) {
   int j;
   for (j = 0; j < 16; ++j) {
     const __m128i values = _mm_set1_epi8((char)left[j]);

@@ -687,8 +701,9 @@ static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
   }
 }

-static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
-                                            const uint8_t* left, int size) {
+static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                            const uint8_t* WEBP_RESTRICT left,
+                                            int size) {
   if (left != NULL) {
     if (size == 8) {
       HE8uv_SSE2(dst, left);

@@ -700,8 +715,9 @@ static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
   }
 }

-static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
-                                const uint8_t* top, int size) {
+static WEBP_INLINE void TM_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                const uint8_t* WEBP_RESTRICT left,
+                                const uint8_t* WEBP_RESTRICT top, int size) {
   const __m128i zero = _mm_setzero_si128();
   int y;
   if (size == 8) {

@@ -728,8 +744,10 @@ static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
   }
 }

-static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
-                                        const uint8_t* top, int size) {
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                        const uint8_t* WEBP_RESTRICT left,
+                                        const uint8_t* WEBP_RESTRICT top,
+                                        int size) {
   if (left != NULL) {
     if (top != NULL) {
       TM_SSE2(dst, left, top, size);

@@ -749,8 +767,9 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
   }
 }

-static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
-                                   const uint8_t* top) {
+static WEBP_INLINE void DC8uv_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                   const uint8_t* WEBP_RESTRICT left,
+                                   const uint8_t* WEBP_RESTRICT top) {
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
   const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);

@@ -758,7 +777,8 @@ static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
   Put8x8uv_SSE2(DC >> 4, dst);
 }

-static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                         const uint8_t* WEBP_RESTRICT top) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   const __m128i sum = _mm_sad_epu8(top_values, zero);

@@ -766,7 +786,8 @@ static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
   Put8x8uv_SSE2(DC >> 3, dst);
 }

-static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                        const uint8_t* WEBP_RESTRICT left) {
   // 'left' is contiguous so we can reuse the top summation.
   DC8uvNoLeft_SSE2(dst, left);
 }

@@ -775,8 +796,9 @@ static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
   Put8x8uv_SSE2(0x80, dst);
 }

-static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
-                                       const uint8_t* top) {
+static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                       const uint8_t* WEBP_RESTRICT left,
+                                       const uint8_t* WEBP_RESTRICT top) {
   if (top != NULL) {
     if (left != NULL) {  // top and left present
       DC8uv_SSE2(dst, left, top);

@@ -790,8 +812,9 @@ static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
   }
 }

-static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
-                                  const uint8_t* top) {
+static WEBP_INLINE void DC16_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                  const uint8_t* WEBP_RESTRICT left,
+                                  const uint8_t* WEBP_RESTRICT top) {
   const __m128i top_row = _mm_load_si128((const __m128i*)top);
   const __m128i left_row = _mm_load_si128((const __m128i*)left);
   const int DC =

@@ -799,13 +822,15 @@ static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
   Put16_SSE2(DC >> 5, dst);
 }

-static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                        const uint8_t* WEBP_RESTRICT top) {
   const __m128i top_row = _mm_load_si128((const __m128i*)top);
   const int DC = VP8HorizontalAdd8b(&top_row) + 8;
   Put16_SSE2(DC >> 4, dst);
 }

-static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                       const uint8_t* WEBP_RESTRICT left) {
   // 'left' is contiguous so we can reuse the top summation.
   DC16NoLeft_SSE2(dst, left);
 }

@@ -814,8 +839,9 @@ static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
   Put16_SSE2(0x80, dst);
 }

-static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
-                                      const uint8_t* top) {
+static WEBP_INLINE void DC16Mode_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                      const uint8_t* WEBP_RESTRICT left,
+                                      const uint8_t* WEBP_RESTRICT top) {
   if (top != NULL) {
     if (left != NULL) {  // top and left present
       DC16_SSE2(dst, left, top);

@@ -844,8 +870,9 @@ static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
 //   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
 //   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
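
The identity in that comment is easy to validate exhaustively. The sketch below (illustrative only, not part of the file) checks that the two-average formulation with the lsb correction, which is what the kernels build from _mm_avg_epu8, matches the rounded AVG3 for every byte triple:

#include <assert.h>

int main(void) {
  int a, b, c;
  for (a = 0; a < 256; ++a) {
    for (b = 0; b < 256; ++b) {
      for (c = 0; c < 256; ++c) {
        const int AC = (a + b + 1) >> 1;  // what _mm_avg_epu8(a, b) yields
        const int BC = (b + c + 1) >> 1;  // what _mm_avg_epu8(b, c) yields
        const int lsb = (AC ^ BC) & 1;
        const int avg3 = ((AC + BC + 1) >> 1) - (((a ^ b) | (b ^ c)) & lsb);
        assert(avg3 == ((a + 2 * b + c + 2) >> 2));
      }
    }
  }
  return 0;
}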
-static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // vertical
+// vertical
+static WEBP_INLINE void VE4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);

@@ -861,8 +888,9 @@ static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
   }
 }

-static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // horizontal
+// horizontal
+static WEBP_INLINE void HE4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int I = top[-2];
   const int J = top[-3];

@@ -874,15 +902,17 @@ static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }

-static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   uint32_t dc = 4;
   int i;
   for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
   Fill_SSE2(dst, dc >> 3, 4);
 }

-static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // Down-Left
+// Down-Left
+static WEBP_INLINE void LD4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);

@@ -898,8 +928,9 @@ static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
   WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }

-static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // Vertical-Right
+// Vertical-Right
+static WEBP_INLINE void VR4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const __m128i one = _mm_set1_epi8(1);
   const int I = top[-2];
   const int J = top[-3];

@@ -924,8 +955,9 @@ static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
   DST(0, 3) = AVG3(K, J, I);
 }

-static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // Vertical-Left
+// Vertical-Left
+static WEBP_INLINE void VL4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
   const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);

@@ -951,8 +983,9 @@ static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
   DST(3, 3) = (extra_out >> 8) & 0xff;
 }

-static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // Down-right
+// Down-right
+static WEBP_INLINE void RD4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const __m128i one = _mm_set1_epi8(1);
   const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
   const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);

@@ -968,7 +1001,8 @@ static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
   WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }

-static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HU4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const int I = top[-2];
   const int J = top[-3];
   const int K = top[-4];

@@ -983,7 +1017,8 @@ static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
   DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }

-static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HD4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int I = top[-2];
   const int J = top[-3];

@@ -1006,7 +1041,8 @@ static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
   DST(1, 3) = AVG3(L, K, J);
 }

-static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void TM4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top));
   const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);

@@ -1028,7 +1064,8 @@ static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]

-static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_SSE2(uint8_t* WEBP_RESTRICT dst,
+                             const uint8_t* WEBP_RESTRICT top) {
   DC4_SSE2(I4DC4 + dst, top);
   TM4_SSE2(I4TM4 + dst, top);
   VE4_SSE2(I4VE4 + dst, top);

@@ -1044,8 +1081,9 @@ static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)

-static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
-                                  const uint8_t* top) {
+static void IntraChromaPreds_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                  const uint8_t* WEBP_RESTRICT left,
+                                  const uint8_t* WEBP_RESTRICT top) {
   // U block
   DC8uvMode_SSE2(C8DC8 + dst, left, top);
   VerticalPred_SSE2(C8VE8 + dst, top, 8);

@@ -1064,8 +1102,9 @@ static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)

-static void Intra16Preds_SSE2(uint8_t* dst,
-                              const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_SSE2(uint8_t* WEBP_RESTRICT dst,
+                              const uint8_t* WEBP_RESTRICT left,
+                              const uint8_t* WEBP_RESTRICT top) {
   DC16Mode_SSE2(I16DC16 + dst, left, top);
   VerticalPred_SSE2(I16VE16 + dst, top, 16);
   HorizontalPred_SSE2(I16HE16 + dst, left, 16);

@@ -1092,7 +1131,8 @@ static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
   *sum = _mm_add_epi32(sum1, sum2);
 }

-static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
+static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* WEBP_RESTRICT a,
+                                     const uint8_t* WEBP_RESTRICT b,
                                      int num_pairs) {
   __m128i sum = _mm_setzero_si128();
   int32_t tmp[4];

@@ -1114,18 +1154,21 @@ static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
   return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
 }

-static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_SSE2(const uint8_t* WEBP_RESTRICT a,
+                         const uint8_t* WEBP_RESTRICT b) {
   return SSE_16xN_SSE2(a, b, 8);
 }

-static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_SSE2(const uint8_t* WEBP_RESTRICT a,
+                        const uint8_t* WEBP_RESTRICT b) {
   return SSE_16xN_SSE2(a, b, 4);
 }

 #define LOAD_8x16b(ptr) \
   _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)

-static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_SSE2(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT b) {
   const __m128i zero = _mm_setzero_si128();
   int num_pairs = 4;
   __m128i sum = zero;

@@ -1152,7 +1195,8 @@ static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
 }
 #undef LOAD_8x16b

-static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_SSE2(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT b) {
   const __m128i zero = _mm_setzero_si128();

   // Load values. Note that we read 8 pixels instead of 4,

@@ -1189,7 +1233,7 @@ static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {

 //------------------------------------------------------------------------------

-static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_SSE2(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) {
   const __m128i mask = _mm_set1_epi16(0x00ff);
   const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
   const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);

@@ -1227,8 +1271,9 @@ static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
-                           const uint16_t* const w) {
+static int TTransform_SSE2(const uint8_t* WEBP_RESTRICT inA,
+                           const uint8_t* WEBP_RESTRICT inB,
+                           const uint16_t* WEBP_RESTRICT const w) {
   int32_t sum[4];
   __m128i tmp_0, tmp_1, tmp_2, tmp_3;
   const __m128i zero = _mm_setzero_si128();

@@ -1328,14 +1373,16 @@ static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
   return sum[0] + sum[1] + sum[2] + sum[3];
 }

-static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
-                         const uint16_t* const w) {
+static int Disto4x4_SSE2(const uint8_t* WEBP_RESTRICT const a,
+                         const uint8_t* WEBP_RESTRICT const b,
+                         const uint16_t* WEBP_RESTRICT const w) {
   const int diff_sum = TTransform_SSE2(a, b, w);
   return abs(diff_sum) >> 5;
 }

-static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
-                           const uint16_t* const w) {
+static int Disto16x16_SSE2(const uint8_t* WEBP_RESTRICT const a,
+                           const uint8_t* WEBP_RESTRICT const b,
+                           const uint16_t* WEBP_RESTRICT const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {

@@ -1350,9 +1397,10 @@ static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
 // Quantization
 //

-static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
-                                            const uint16_t* const sharpen,
-                                            const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock_SSE2(
+    int16_t in[16], int16_t out[16],
+    const uint16_t* WEBP_RESTRICT const sharpen,
+    const VP8Matrix* WEBP_RESTRICT const mtx) {
   const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
   const __m128i zero = _mm_setzero_si128();
   __m128i coeff0, coeff8;

@@ -1463,17 +1511,17 @@ static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
 }
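
As an aid to reading the vector code above, here is a scalar outline of the same quantization step, paraphrased from libwebp's plain-C path (treat the details as a sketch; kZigzag, QFIX and MAX_LEVEL are constants from the encoder's headers, and VP8Matrix carries the per-coefficient quantizers q_/iq_, rounding bias_, zero threshold zthresh_ and optional sharpen_):

static int QuantizeBlock_Sketch(int16_t in[16], int16_t out[16],
                                const VP8Matrix* const mtx) {
  int last = -1;
  int n;
  for (n = 0; n < 16; ++n) {
    const int j = kZigzag[n];  // process coefficients in zigzag order
    const int sign = (in[j] < 0);
    const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    if (coeff > mtx->zthresh_[j]) {
      // Fixed-point division: level = (coeff * iq + bias) >> QFIX.
      int level = (int)(((uint64_t)coeff * mtx->iq_[j] + mtx->bias_[j])
                        >> QFIX);
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
      in[j] = level * (int)mtx->q_[j];  // reconstructed value, in place
      out[n] = level;
      if (level) last = n;
    } else {
      out[n] = 0;
      in[j] = 0;
    }
  }
  return (last >= 0);  // "has at least one non-zero coefficient"
}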
static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16], static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) { const VP8Matrix* WEBP_RESTRICT const mtx) {
return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx); return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
} }
static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16], static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) { const VP8Matrix* WEBP_RESTRICT const mtx) {
return DoQuantizeBlock_SSE2(in, out, NULL, mtx); return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
} }
static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32], static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) { const VP8Matrix* WEBP_RESTRICT const mtx) {
int nz; int nz;
const uint16_t* const sharpen = &mtx->sharpen_[0]; const uint16_t* const sharpen = &mtx->sharpen_[0];
nz = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0; nz = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
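
The change repeated throughout this file (and the rest of the dsp/ diffs below) is the addition of WEBP_RESTRICT to pointer parameters: it promises the compiler that the annotated pointers never alias, which lets the SIMD and C loops be scheduled without runtime overlap checks. A minimal sketch of how such a macro is commonly defined (the real definition lives in a shared libwebp header; the guards below are illustrative, not a copy of it):

#if defined(__GNUC__) || defined(__clang__)
#define WEBP_RESTRICT __restrict__   /* GCC/Clang spelling of C99 restrict */
#elif defined(_MSC_VER)
#define WEBP_RESTRICT __restrict     /* MSVC spelling */
#else
#define WEBP_RESTRICT                /* portable fallback: expands to nothing */
#endif

/* With the annotation, a loop like this can be vectorized without runtime
 * overlap checks, since 'src' and 'dst' are declared non-aliasing: */
static void Copy(const unsigned char* WEBP_RESTRICT src,
                 unsigned char* WEBP_RESTRICT dst, int n) {
  int i;
  for (i = 0; i < n; ++i) dst[i] = src[i];
}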


@@ -23,9 +23,10 @@
 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms.
-static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
-                                   int start_block, int end_block,
-                                   VP8Histogram* const histo) {
+static void CollectHistogram_SSE41(const uint8_t* WEBP_RESTRICT ref,
+                                   const uint8_t* WEBP_RESTRICT pred,
+                                   int start_block, int end_block,
+                                   VP8Histogram* WEBP_RESTRICT const histo) {
   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@@ -168,14 +169,16 @@ static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
   return sum[0] + sum[1] + sum[2] + sum[3];
 }
-static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
-                          const uint16_t* const w) {
+static int Disto4x4_SSE41(const uint8_t* WEBP_RESTRICT const a,
+                          const uint8_t* WEBP_RESTRICT const b,
+                          const uint16_t* WEBP_RESTRICT const w) {
   const int diff_sum = TTransform_SSE41(a, b, w);
   return abs(diff_sum) >> 5;
 }
-static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
-                            const uint16_t* const w) {
+static int Disto16x16_SSE41(const uint8_t* WEBP_RESTRICT const a,
+                            const uint8_t* WEBP_RESTRICT const b,
+                            const uint16_t* WEBP_RESTRICT const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@@ -301,17 +304,17 @@ static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
 #undef PSHUFB_CST
 static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
-                               const VP8Matrix* const mtx) {
+                               const VP8Matrix* WEBP_RESTRICT const mtx) {
   return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
 }
 static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
-                                  const VP8Matrix* const mtx) {
+                                  const VP8Matrix* WEBP_RESTRICT const mtx) {
   return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
 }
 static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
-                                 const VP8Matrix* const mtx) {
+                                 const VP8Matrix* WEBP_RESTRICT const mtx) {
   int nz;
   const uint16_t* const sharpen = &mtx->sharpen_[0];
   nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
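
The SSE4.1 kernels above get the same treatment as the SSE2 ones. To see what the "weighted sum of the absolute value of transformed coefficients" comment refers to, here is a scalar sketch of the distortion metric, adapted from the encoder's C fallback (BPS is the prediction-buffer stride; the SIMD TTransform_SSE2/SSE41 kernels fuse the transform of both inputs into a single pass, which is why they take two pointers):

#include <stdint.h>
#include <stdlib.h>   /* abs() */

#define BPS 32   /* prediction-buffer stride used by the VP8 encoder */

/* Walsh-Hadamard-style transform of a 4x4 block: horizontal pass into tmp[],
 * vertical pass accumulating the w[]-weighted absolute coefficients. */
static int TTransform(const uint8_t* in, const uint16_t* w) {
  int sum = 0;
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += BPS) {   /* horizontal pass */
    const int a0 = in[0] + in[2];
    const int a1 = in[1] + in[3];
    const int a2 = in[1] - in[3];
    const int a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i, ++w) {         /* vertical pass + weighting */
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    sum += w[0] * abs(a0 + a1) + w[4] * abs(a3 + a2) +
           w[8] * abs(a3 - a2) + w[12] * abs(a0 - a1);
  }
  return sum;
}

/* Distortion between two 4x4 blocks: difference of their weighted transform
 * sums, scaled down by 32 (the >> 5 seen in Disto4x4_SSE2/SSE41 above). */
static int Disto4x4(const uint8_t* a, const uint8_t* b, const uint16_t* w) {
  return abs(TTransform(b, w) - TTransform(a, w)) >> 5;
}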


@ -23,55 +23,42 @@
do { \ do { \
assert((in) != NULL); \ assert((in) != NULL); \
assert((out) != NULL); \ assert((out) != NULL); \
assert((in) != (out)); \
assert(width > 0); \ assert(width > 0); \
assert(height > 0); \ assert(height > 0); \
assert(stride >= width); \ assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; /* Silence unused warning. */ \
} while (0) } while (0)
#if !WEBP_NEON_OMIT_C_CODE #if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred, static WEBP_INLINE void PredictLine_C(const uint8_t* WEBP_RESTRICT src,
uint8_t* dst, int length, int inverse) { const uint8_t* WEBP_RESTRICT pred,
uint8_t* WEBP_RESTRICT dst, int length) {
int i; int i;
if (inverse) { for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]);
for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] + pred[i]);
} else {
for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]);
}
} }
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Horizontal filter. // Horizontal filter.
static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in, static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* WEBP_RESTRICT in,
int width, int height, int stride, int width, int height, int stride,
int row, int num_rows, uint8_t* WEBP_RESTRICT out) {
int inverse, uint8_t* out) { const uint8_t* preds = in;
const uint8_t* preds; int row;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
DCHECK(in, out); DCHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) { // Leftmost pixel is the same as input for topmost scanline.
// Leftmost pixel is the same as input for topmost scanline. out[0] = in[0];
out[0] = in[0]; PredictLine_C(in + 1, preds, out + 1, width - 1);
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse); preds += stride;
row = 1; in += stride;
preds += stride; out += stride;
in += stride;
out += stride;
}
// Filter line-by-line. // Filter line-by-line.
while (row < last_row) { for (row = 1; row < height; ++row) {
// Leftmost pixel is predicted from above. // Leftmost pixel is predicted from above.
PredictLine_C(in, preds - stride, out, 1, inverse); PredictLine_C(in, preds - stride, out, 1);
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse); PredictLine_C(in + 1, preds, out + 1, width - 1);
++row;
preds += stride; preds += stride;
in += stride; in += stride;
out += stride; out += stride;
@ -81,35 +68,23 @@ static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Vertical filter. // Vertical filter.
static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in, static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* WEBP_RESTRICT in,
int width, int height, int stride, int width, int height, int stride,
int row, int num_rows, uint8_t* WEBP_RESTRICT out) {
int inverse, uint8_t* out) { const uint8_t* preds = in;
const uint8_t* preds; int row;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
DCHECK(in, out); DCHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) { // Very first top-left pixel is copied.
// Very first top-left pixel is copied. out[0] = in[0];
out[0] = in[0]; // Rest of top scan-line is left-predicted.
// Rest of top scan-line is left-predicted. PredictLine_C(in + 1, preds, out + 1, width - 1);
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse); in += stride;
row = 1; out += stride;
in += stride;
out += stride;
} else {
// We are starting from in-between. Make sure 'preds' points to prev row.
preds -= stride;
}
// Filter line-by-line. // Filter line-by-line.
while (row < last_row) { for (row = 1; row < height; ++row) {
PredictLine_C(in, preds, out, width, inverse); PredictLine_C(in, preds, out, width);
++row;
preds += stride; preds += stride;
in += stride; in += stride;
out += stride; out += stride;
@ -126,40 +101,31 @@ static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
} }
#if !WEBP_NEON_OMIT_C_CODE #if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in, static WEBP_INLINE void DoGradientFilter_C(const uint8_t* WEBP_RESTRICT in,
int width, int height, int stride, int width, int height, int stride,
int row, int num_rows, uint8_t* WEBP_RESTRICT out) {
int inverse, uint8_t* out) { const uint8_t* preds = in;
const uint8_t* preds; int row;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
DCHECK(in, out); DCHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
// left prediction for top scan-line // left prediction for top scan-line
if (row == 0) { out[0] = in[0];
out[0] = in[0]; PredictLine_C(in + 1, preds, out + 1, width - 1);
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse); preds += stride;
row = 1; in += stride;
preds += stride; out += stride;
in += stride;
out += stride;
}
// Filter line-by-line. // Filter line-by-line.
while (row < last_row) { for (row = 1; row < height; ++row) {
int w; int w;
// leftmost pixel: predict from above. // leftmost pixel: predict from above.
PredictLine_C(in, preds - stride, out, 1, inverse); PredictLine_C(in, preds - stride, out, 1);
for (w = 1; w < width; ++w) { for (w = 1; w < width; ++w) {
const int pred = GradientPredictor_C(preds[w - 1], const int pred = GradientPredictor_C(preds[w - 1],
preds[w - stride], preds[w - stride],
preds[w - stride - 1]); preds[w - stride - 1]);
out[w] = (uint8_t)(in[w] + (inverse ? pred : -pred)); out[w] = (uint8_t)(in[w] - pred);
} }
++row;
preds += stride; preds += stride;
in += stride; in += stride;
out += stride; out += stride;
@ -172,20 +138,22 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
#if !WEBP_NEON_OMIT_C_CODE #if !WEBP_NEON_OMIT_C_CODE
static void HorizontalFilter_C(const uint8_t* data, int width, int height, static void HorizontalFilter_C(const uint8_t* WEBP_RESTRICT data,
int stride, uint8_t* filtered_data) { int width, int height, int stride,
DoHorizontalFilter_C(data, width, height, stride, 0, height, 0, uint8_t* WEBP_RESTRICT filtered_data) {
filtered_data); DoHorizontalFilter_C(data, width, height, stride, filtered_data);
} }
static void VerticalFilter_C(const uint8_t* data, int width, int height, static void VerticalFilter_C(const uint8_t* WEBP_RESTRICT data,
int stride, uint8_t* filtered_data) { int width, int height, int stride,
DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data); uint8_t* WEBP_RESTRICT filtered_data) {
DoVerticalFilter_C(data, width, height, stride, filtered_data);
} }
static void GradientFilter_C(const uint8_t* data, int width, int height, static void GradientFilter_C(const uint8_t* WEBP_RESTRICT data,
int stride, uint8_t* filtered_data) { int width, int height, int stride,
DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data); uint8_t* WEBP_RESTRICT filtered_data) {
DoGradientFilter_C(data, width, height, stride, filtered_data);
} }
#endif // !WEBP_NEON_OMIT_C_CODE #endif // !WEBP_NEON_OMIT_C_CODE
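
The rewritten C filters above always walk the whole frame: row 0 is left-predicted, and every later row predicts its leftmost pixel from above and the rest from the left. A tiny self-contained demo of the forward horizontal filter (hypothetical 4x2 buffers, not from the diff):

#include <stdio.h>
#include <stdint.h>

int main(void) {
  enum { W = 4, H = 2 };
  const uint8_t in[H][W] = { { 10, 12, 15, 15 },
                             { 11, 13, 16, 16 } };
  uint8_t out[H][W];
  int x, y;
  out[0][0] = in[0][0];                       /* top-left copied verbatim */
  for (x = 1; x < W; ++x) {                   /* top row: left prediction */
    out[0][x] = (uint8_t)(in[0][x] - in[0][x - 1]);
  }
  for (y = 1; y < H; ++y) {
    out[y][0] = (uint8_t)(in[y][0] - in[y - 1][0]);  /* predicted from above */
    for (x = 1; x < W; ++x) {
      out[y][x] = (uint8_t)(in[y][x] - in[y][x - 1]);
    }
  }
  for (y = 0; y < H; ++y) {
    for (x = 0; x < W; ++x) printf("%4d", out[y][x]);
    printf("\n");   /* prints:  10   2   3   0  /  1   2   3   0 */
  }
  return 0;
}

Smooth content turns into small residuals, which is what makes the filtered planes cheaper to compress.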


@@ -26,13 +26,12 @@
 #define DCHECK(in, out) \
 do { \
-  assert(in != NULL); \
-  assert(out != NULL); \
+  assert((in) != NULL); \
+  assert((out) != NULL); \
+  assert((in) != (out)); \
   assert(width > 0); \
   assert(height > 0); \
   assert(stride >= width); \
-  assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
-  (void)height;  /* Silence unused warning. */ \
 } while (0)
 #define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do { \
@@ -103,7 +102,8 @@
   ); \
 } while (0)
-static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
+static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* WEBP_RESTRICT src,
+                                              uint8_t* WEBP_RESTRICT dst,
                                               int length) {
   DO_PREDICT_LINE(src, dst, length, 0);
 }
@@ -184,99 +184,75 @@ static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
 // Horizontal filter.
 #define FILTER_LINE_BY_LINE do { \
-    while (row < last_row) { \
+    for (row = 1; row < height; ++row) { \
       PREDICT_LINE_ONE_PASS(in, preds - stride, out); \
       DO_PREDICT_LINE(in + 1, out + 1, width - 1, 0); \
-      ++row; \
       preds += stride; \
       in += stride; \
       out += stride; \
     } \
 } while (0)
-static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
-                                                     int width, int height,
-                                                     int stride,
-                                                     int row, int num_rows,
-                                                     uint8_t* out) {
-  const uint8_t* preds;
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(
+    const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
+    uint8_t* WEBP_RESTRICT out) {
+  const uint8_t* preds = in;
+  int row;
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  preds = in;
-  if (row == 0) {
-    // Leftmost pixel is the same as input for topmost scanline.
-    out[0] = in[0];
-    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
-    row = 1;
-    preds += stride;
-    in += stride;
-    out += stride;
-  }
+  // Leftmost pixel is the same as input for topmost scanline.
+  out[0] = in[0];
+  PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+  preds += stride;
+  in += stride;
+  out += stride;
   // Filter line-by-line.
   FILTER_LINE_BY_LINE;
 }
 #undef FILTER_LINE_BY_LINE
-static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
-                                       int width, int height,
-                                       int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
-                               filtered_data);
+static void HorizontalFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data,
+                                       int width, int height, int stride,
+                                       uint8_t* WEBP_RESTRICT filtered_data) {
+  DoHorizontalFilter_MIPSdspR2(data, width, height, stride, filtered_data);
 }
 //------------------------------------------------------------------------------
 // Vertical filter.
 #define FILTER_LINE_BY_LINE do { \
-    while (row < last_row) { \
+    for (row = 1; row < height; ++row) { \
      DO_PREDICT_LINE_VERTICAL(in, preds, out, width, 0); \
-      ++row; \
      preds += stride; \
      in += stride; \
      out += stride; \
    } \
 } while (0)
-static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
-                                                   int width, int height,
-                                                   int stride,
-                                                   int row, int num_rows,
-                                                   uint8_t* out) {
-  const uint8_t* preds;
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(
+    const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
+    uint8_t* WEBP_RESTRICT out) {
+  const uint8_t* preds = in;
+  int row;
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  preds = in;
-  if (row == 0) {
-    // Very first top-left pixel is copied.
-    out[0] = in[0];
-    // Rest of top scan-line is left-predicted.
-    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  } else {
-    // We are starting from in-between. Make sure 'preds' points to prev row.
-    preds -= stride;
-  }
+  // Very first top-left pixel is copied.
+  out[0] = in[0];
+  // Rest of top scan-line is left-predicted.
+  PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
   // Filter line-by-line.
   FILTER_LINE_BY_LINE;
 }
 #undef FILTER_LINE_BY_LINE
-static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
-                                     int stride, uint8_t* filtered_data) {
-  DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
-                             filtered_data);
+static void VerticalFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data,
+                                     int width, int height, int stride,
+                                     uint8_t* WEBP_RESTRICT filtered_data) {
+  DoVerticalFilter_MIPSdspR2(data, width, height, stride, filtered_data);
 }
 //------------------------------------------------------------------------------
@@ -297,7 +273,7 @@ static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
 }
 #define FILTER_LINE_BY_LINE(PREDS, OPERATION) do { \
-    while (row < last_row) { \
+    for (row = 1; row < height; ++row) { \
       int w; \
       PREDICT_LINE_ONE_PASS(in, PREDS - stride, out); \
       for (w = 1; w < width; ++w) { \
@@ -306,42 +282,34 @@ static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
                                        PREDS[w - stride - 1]); \
         out[w] = in[w] OPERATION pred; \
       } \
-      ++row; \
       in += stride; \
       out += stride; \
     } \
 } while (0)
-static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
-                                       int width, int height, int stride,
-                                       int row, int num_rows, uint8_t* out) {
-  const uint8_t* preds;
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static void DoGradientFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT in,
+                                       int width, int height, int stride,
+                                       uint8_t* WEBP_RESTRICT out) {
+  const uint8_t* preds = in;
+  int row;
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  preds = in;
   // left prediction for top scan-line
-  if (row == 0) {
-    out[0] = in[0];
-    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
-    row = 1;
-    preds += stride;
-    in += stride;
-    out += stride;
-  }
+  out[0] = in[0];
+  PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+  preds += stride;
+  in += stride;
+  out += stride;
   // Filter line-by-line.
   FILTER_LINE_BY_LINE(in, -);
 }
 #undef FILTER_LINE_BY_LINE
-static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
-                                     int stride, uint8_t* filtered_data) {
-  DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
-                             filtered_data);
+static void GradientFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data,
+                                     int width, int height, int stride,
+                                     uint8_t* WEBP_RESTRICT filtered_data) {
+  DoGradientFilter_MIPSdspR2(data, width, height, stride, filtered_data);
 }
 //------------------------------------------------------------------------------
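
The same simplification as in filters.c applies here and in the remaining filter back-ends: the old helpers could filter an arbitrary row range (and, in the C file, also run inverted), but every caller already processed the whole frame in the forward direction, so the row/num_rows/inverse parameters were dropped and the loops hard-coded to rows 1..height. The call shape shrinks accordingly (taken straight from the wrappers in these diffs):

/* 1.4.0: filter rows [row, row + num_rows), optionally inverted (C file). */
DoHorizontalFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
/* 1.5.0: forward filtering of the whole frame only. */
DoHorizontalFilter_C(data, width, height, stride, filtered_data);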


@@ -21,7 +21,8 @@
 static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
                                             const uint8_t* pred,
-                                            uint8_t* dst, int length) {
+                                            uint8_t* WEBP_RESTRICT dst,
+                                            int length) {
   v16u8 src0, pred0, dst0;
   assert(length >= 0);
   while (length >= 32) {
@@ -58,8 +59,9 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
 #define DCHECK(in, out) \
 do { \
-  assert(in != NULL); \
-  assert(out != NULL); \
+  assert((in) != NULL); \
+  assert((out) != NULL); \
+  assert((in) != (out)); \
   assert(width > 0); \
   assert(height > 0); \
   assert(stride >= width); \
@@ -68,8 +70,9 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
 //------------------------------------------------------------------------------
 // Horrizontal filter
-static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
-                                 int stride, uint8_t* filtered_data) {
+static void HorizontalFilter_MSA(const uint8_t* WEBP_RESTRICT data,
+                                 int width, int height, int stride,
+                                 uint8_t* WEBP_RESTRICT filtered_data) {
   const uint8_t* preds = data;
   const uint8_t* in = data;
   uint8_t* out = filtered_data;
@@ -99,8 +102,8 @@ static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
 static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
                                             const uint8_t* ppred,
-                                            uint8_t* poutput, int stride,
-                                            int size) {
+                                            uint8_t* WEBP_RESTRICT poutput,
+                                            int stride, int size) {
   int w;
   const v16i8 zero = { 0 };
   while (size >= 16) {
@@ -131,8 +134,9 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
 }
-static void GradientFilter_MSA(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
+static void GradientFilter_MSA(const uint8_t* WEBP_RESTRICT data,
+                               int width, int height, int stride,
+                               uint8_t* WEBP_RESTRICT filtered_data) {
   const uint8_t* in = data;
   const uint8_t* preds = data;
   uint8_t* out = filtered_data;
@@ -159,8 +163,9 @@ static void GradientFilter_MSA(const uint8_t* data, int width, int height,
 //------------------------------------------------------------------------------
 // Vertical filter
-static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
+static void VerticalFilter_MSA(const uint8_t* WEBP_RESTRICT data,
+                               int width, int height, int stride,
+                               uint8_t* WEBP_RESTRICT filtered_data) {
   const uint8_t* in = data;
   const uint8_t* preds = data;
   uint8_t* out = filtered_data;


@@ -23,13 +23,12 @@
 #define DCHECK(in, out) \
 do { \
-  assert(in != NULL); \
-  assert(out != NULL); \
+  assert((in) != NULL); \
+  assert((out) != NULL); \
+  assert((in) != (out)); \
   assert(width > 0); \
   assert(height > 0); \
   assert(stride >= width); \
-  assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
-  (void)height;  /* Silence unused warning. */ \
 } while (0)
 // load eight u8 and widen to s16
@@ -46,7 +45,7 @@
 #define ROTATE_RIGHT_N(A, N) vext_u8((A), (A), (8 - (N)) % 8)
 static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
-                             uint8_t* dst, int length) {
+                             uint8_t* WEBP_RESTRICT dst, int length) {
   int i;
   assert(length >= 0);
   for (i = 0; i + 16 <= length; i += 16) {
@@ -59,86 +58,70 @@ static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
 }
 // Special case for left-based prediction (when preds==dst-1 or preds==src-1).
-static void PredictLineLeft_NEON(const uint8_t* src, uint8_t* dst, int length) {
+static void PredictLineLeft_NEON(const uint8_t* WEBP_RESTRICT src,
+                                 uint8_t* WEBP_RESTRICT dst, int length) {
   PredictLine_NEON(src, src - 1, dst, length);
 }
 //------------------------------------------------------------------------------
 // Horizontal filter.
-static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in,
-                                                int width, int height,
-                                                int stride,
-                                                int row, int num_rows,
-                                                uint8_t* out) {
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE void DoHorizontalFilter_NEON(
+    const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
+    uint8_t* WEBP_RESTRICT out) {
+  int row;
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  if (row == 0) {
-    // Leftmost pixel is the same as input for topmost scanline.
-    out[0] = in[0];
-    PredictLineLeft_NEON(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  }
+  // Leftmost pixel is the same as input for topmost scanline.
+  out[0] = in[0];
+  PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     // Leftmost pixel is predicted from above.
     out[0] = in[0] - in[-stride];
     PredictLineLeft_NEON(in + 1, out + 1, width - 1);
-    ++row;
     in += stride;
     out += stride;
   }
 }
-static void HorizontalFilter_NEON(const uint8_t* data, int width, int height,
-                                  int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter_NEON(data, width, height, stride, 0, height,
-                          filtered_data);
+static void HorizontalFilter_NEON(const uint8_t* WEBP_RESTRICT data,
+                                  int width, int height, int stride,
+                                  uint8_t* WEBP_RESTRICT filtered_data) {
+  DoHorizontalFilter_NEON(data, width, height, stride, filtered_data);
 }
 //------------------------------------------------------------------------------
 // Vertical filter.
-static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
-                                              int width, int height, int stride,
-                                              int row, int num_rows,
-                                              uint8_t* out) {
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* WEBP_RESTRICT in,
+                                              int width, int height, int stride,
+                                              uint8_t* WEBP_RESTRICT out) {
+  int row;
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  if (row == 0) {
-    // Very first top-left pixel is copied.
-    out[0] = in[0];
-    // Rest of top scan-line is left-predicted.
-    PredictLineLeft_NEON(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  }
+  // Very first top-left pixel is copied.
+  out[0] = in[0];
+  // Rest of top scan-line is left-predicted.
+  PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     PredictLine_NEON(in, in - stride, out, width);
-    ++row;
     in += stride;
     out += stride;
   }
 }
-static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
-                                int stride, uint8_t* filtered_data) {
-  DoVerticalFilter_NEON(data, width, height, stride, 0, height,
-                        filtered_data);
+static void VerticalFilter_NEON(const uint8_t* WEBP_RESTRICT data,
+                                int width, int height, int stride,
+                                uint8_t* WEBP_RESTRICT filtered_data) {
+  DoVerticalFilter_NEON(data, width, height, stride, filtered_data);
 }
 //------------------------------------------------------------------------------
@@ -151,7 +134,8 @@ static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
 static void GradientPredictDirect_NEON(const uint8_t* const row,
                                        const uint8_t* const top,
-                                       uint8_t* const out, int length) {
+                                       uint8_t* WEBP_RESTRICT const out,
+                                       int length) {
   int i;
   for (i = 0; i + 8 <= length; i += 8) {
     const uint8x8_t A = vld1_u8(&row[i - 1]);
@@ -167,40 +151,31 @@ static void GradientPredictDirect_NEON(const uint8_t* const row,
   }
 }
-static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
-                                              int width, int height,
-                                              int stride,
-                                              int row, int num_rows,
-                                              uint8_t* out) {
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* WEBP_RESTRICT in,
+                                              int width, int height, int stride,
+                                              uint8_t* WEBP_RESTRICT out) {
+  int row;
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
   // left prediction for top scan-line
-  if (row == 0) {
-    out[0] = in[0];
-    PredictLineLeft_NEON(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  }
+  out[0] = in[0];
+  PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     out[0] = in[0] - in[-stride];
     GradientPredictDirect_NEON(in + 1, in + 1 - stride, out + 1, width - 1);
-    ++row;
     in += stride;
     out += stride;
   }
 }
-static void GradientFilter_NEON(const uint8_t* data, int width, int height,
-                                int stride, uint8_t* filtered_data) {
-  DoGradientFilter_NEON(data, width, height, stride, 0, height,
-                        filtered_data);
+static void GradientFilter_NEON(const uint8_t* WEBP_RESTRICT data,
                                int width, int height, int stride,
                                uint8_t* WEBP_RESTRICT filtered_data) {
+  DoGradientFilter_NEON(data, width, height, stride, filtered_data);
 }
 #undef DCHECK
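
All three NEON filters lean on the same gradient predictor as the C code: predict a pixel from its left (a), top (b) and top-left (c) neighbors as a + b - c, clipped to the byte range. A small standalone sketch of that predictor with a few checks (the values are illustrative):

#include <stdio.h>
#include <stdint.h>

static int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
  const int g = a + b - c;
  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  /* clip to [0, 255] */
}

int main(void) {
  printf("%d\n", GradientPredictor(100, 120, 110));  /* 110: smooth gradient */
  printf("%d\n", GradientPredictor(250, 240, 10));   /* 255: clipped high */
  printf("%d\n", GradientPredictor(5, 10, 200));     /* 0: clipped low */
  return 0;
}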


@@ -27,15 +27,15 @@
 do { \
   assert((in) != NULL); \
   assert((out) != NULL); \
+  assert((in) != (out)); \
   assert(width > 0); \
   assert(height > 0); \
   assert(stride >= width); \
-  assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
-  (void)height;  /* Silence unused warning. */ \
 } while (0)
-static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
-                                uint8_t* dst, int length) {
+static void PredictLineTop_SSE2(const uint8_t* WEBP_RESTRICT src,
+                                const uint8_t* WEBP_RESTRICT pred,
+                                uint8_t* WEBP_RESTRICT dst, int length) {
   int i;
   const int max_pos = length & ~31;
   assert(length >= 0);
@@ -53,7 +53,8 @@ static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
 }
 // Special case for left-based prediction (when preds==dst-1 or preds==src-1).
-static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
+static void PredictLineLeft_SSE2(const uint8_t* WEBP_RESTRICT src,
+                                 uint8_t* WEBP_RESTRICT dst, int length) {
   int i;
   const int max_pos = length & ~31;
   assert(length >= 0);
@@ -73,32 +74,23 @@ static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
 //------------------------------------------------------------------------------
 // Horizontal filter.
-static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
-                                                int width, int height,
-                                                int stride,
-                                                int row, int num_rows,
-                                                uint8_t* out) {
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE void DoHorizontalFilter_SSE2(
+    const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
+    uint8_t* WEBP_RESTRICT out) {
+  int row;
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  if (row == 0) {
-    // Leftmost pixel is the same as input for topmost scanline.
-    out[0] = in[0];
-    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  }
+  // Leftmost pixel is the same as input for topmost scanline.
+  out[0] = in[0];
+  PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     // Leftmost pixel is predicted from above.
     out[0] = in[0] - in[-stride];
     PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
-    ++row;
     in += stride;
     out += stride;
   }
@@ -107,30 +99,22 @@ static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Vertical filter.
-static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
-                                              int width, int height, int stride,
-                                              int row, int num_rows,
-                                              uint8_t* out) {
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* WEBP_RESTRICT in,
+                                              int width, int height, int stride,
+                                              uint8_t* WEBP_RESTRICT out) {
+  int row;
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  if (row == 0) {
-    // Very first top-left pixel is copied.
-    out[0] = in[0];
-    // Rest of top scan-line is left-predicted.
-    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  }
+  // Very first top-left pixel is copied.
+  out[0] = in[0];
+  // Rest of top scan-line is left-predicted.
+  PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     PredictLineTop_SSE2(in, in - stride, out, width);
-    ++row;
     in += stride;
     out += stride;
   }
@@ -146,7 +130,8 @@ static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) {
 static void GradientPredictDirect_SSE2(const uint8_t* const row,
                                        const uint8_t* const top,
-                                       uint8_t* const out, int length) {
+                                       uint8_t* WEBP_RESTRICT const out,
+                                       int length) {
   const int max_pos = length & ~7;
   int i;
   const __m128i zero = _mm_setzero_si128();
@@ -170,30 +155,22 @@ static void GradientPredictDirect_SSE2(const uint8_t* const row,
   }
 }
-static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
-                                              int width, int height, int stride,
-                                              int row, int num_rows,
-                                              uint8_t* out) {
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* WEBP_RESTRICT in,
+                                              int width, int height, int stride,
+                                              uint8_t* WEBP_RESTRICT out) {
+  int row;
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
   // left prediction for top scan-line
-  if (row == 0) {
-    out[0] = in[0];
-    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  }
+  out[0] = in[0];
+  PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     out[0] = (uint8_t)(in[0] - in[-stride]);
     GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
-    ++row;
     in += stride;
     out += stride;
   }
@@ -203,20 +180,22 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
 //------------------------------------------------------------------------------
-static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height,
-                                  int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter_SSE2(data, width, height, stride, 0, height,
-                          filtered_data);
+static void HorizontalFilter_SSE2(const uint8_t* WEBP_RESTRICT data,
+                                  int width, int height, int stride,
+                                  uint8_t* WEBP_RESTRICT filtered_data) {
+  DoHorizontalFilter_SSE2(data, width, height, stride, filtered_data);
 }
-static void VerticalFilter_SSE2(const uint8_t* data, int width, int height,
-                                int stride, uint8_t* filtered_data) {
-  DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
+static void VerticalFilter_SSE2(const uint8_t* WEBP_RESTRICT data,
+                                int width, int height, int stride,
+                                uint8_t* WEBP_RESTRICT filtered_data) {
+  DoVerticalFilter_SSE2(data, width, height, stride, filtered_data);
 }
-static void GradientFilter_SSE2(const uint8_t* data, int width, int height,
-                                int stride, uint8_t* filtered_data) {
-  DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
+static void GradientFilter_SSE2(const uint8_t* WEBP_RESTRICT data,
+                                int width, int height, int stride,
+                                uint8_t* WEBP_RESTRICT filtered_data) {
+  DoGradientFilter_SSE2(data, width, height, stride, filtered_data);
 }
 //------------------------------------------------------------------------------
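
These forward filters produce residuals; the decoder runs the matching inverse ("unfiltering"), accumulating residuals back into pixels. A sketch of what the inverse of the horizontal filter has to do (the real inverse lives in the decoder-side dsp code; this standalone version only mirrors its semantics):

#include <stddef.h>
#include <stdint.h>

/* prev_row == NULL for the topmost scanline. */
static void HorizontalUnfilterRow(const uint8_t* prev_row,
                                  const uint8_t* residual,
                                  uint8_t* row, int width) {
  uint8_t pred = (prev_row == NULL) ? 0 : prev_row[0];
  int x;
  for (x = 0; x < width; ++x) {
    pred = (uint8_t)(pred + residual[x]);  /* undo out[x] = in[x] - pred */
    row[x] = pred;
  }
}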


@@ -107,14 +107,14 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 //------------------------------------------------------------------------------
 // Predictors
-uint32_t VP8LPredictor0_C(const uint32_t* const left,
-                          const uint32_t* const top) {
+static uint32_t VP8LPredictor0_C(const uint32_t* const left,
+                                 const uint32_t* const top) {
   (void)top;
   (void)left;
   return ARGB_BLACK;
 }
-uint32_t VP8LPredictor1_C(const uint32_t* const left,
-                          const uint32_t* const top) {
+static uint32_t VP8LPredictor1_C(const uint32_t* const left,
+                                 const uint32_t* const top) {
   (void)top;
   return *left;
 }
@@ -182,13 +182,13 @@ uint32_t VP8LPredictor13_C(const uint32_t* const left,
 }
 static void PredictorAdd0_C(const uint32_t* in, const uint32_t* upper,
-                            int num_pixels, uint32_t* out) {
+                            int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int x;
   (void)upper;
   for (x = 0; x < num_pixels; ++x) out[x] = VP8LAddPixels(in[x], ARGB_BLACK);
 }
 static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
-                            int num_pixels, uint32_t* out) {
+                            int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   uint32_t left = out[-1];
   (void)upper;
@@ -441,8 +441,8 @@ static int is_big_endian(void) {
   return (tmp.b[0] != 1);
 }
-void VP8LConvertBGRAToRGB_C(const uint32_t* src,
-                            int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGB_C(const uint32_t* WEBP_RESTRICT src,
+                            int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
     const uint32_t argb = *src++;
@@ -452,8 +452,8 @@ void VP8LConvertBGRAToRGB_C(const uint32_t* src,
   }
 }
-void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGBA_C(const uint32_t* WEBP_RESTRICT src,
+                             int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
     const uint32_t argb = *src++;
@@ -464,8 +464,8 @@ void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
   }
 }
-void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
-                                 int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* WEBP_RESTRICT src,
+                                 int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
     const uint32_t argb = *src++;
@@ -481,8 +481,8 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
   }
 }
-void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
-                               int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGB565_C(const uint32_t* WEBP_RESTRICT src,
+                               int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
     const uint32_t argb = *src++;
@@ -498,8 +498,8 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
   }
 }
-void VP8LConvertBGRAToBGR_C(const uint32_t* src,
-                            int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToBGR_C(const uint32_t* WEBP_RESTRICT src,
+                            int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
     const uint32_t argb = *src++;
@@ -509,8 +509,8 @@ void VP8LConvertBGRAToBGR_C(const uint32_t* src,
   }
 }
-static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
-                       int swap_on_big_endian) {
+static void CopyOrSwap(const uint32_t* WEBP_RESTRICT src, int num_pixels,
+                       uint8_t* WEBP_RESTRICT dst, int swap_on_big_endian) {
   if (is_big_endian() == swap_on_big_endian) {
     const uint32_t* const src_end = src + num_pixels;
     while (src < src_end) {
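
The conversion routines above all follow one pattern: the lossless code keeps each pixel as a 32-bit word with alpha in bits 31..24, red in 23..16, green in 15..8 and blue in 7..0, and the converters shift those channels out into the requested byte order. For instance, the BGRA-to-RGB case boils down to:

#include <stdint.h>

static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
                             uint8_t* dst) {
  const uint32_t* const src_end = src + num_pixels;
  while (src < src_end) {
    const uint32_t argb = *src++;
    *dst++ = (uint8_t)((argb >> 16) & 0xff);  /* R */
    *dst++ = (uint8_t)((argb >>  8) & 0xff);  /* G */
    *dst++ = (uint8_t)((argb >>  0) & 0xff);  /* B */
  }
}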


@@ -18,6 +18,7 @@
 #include "src/webp/types.h"
 #include "src/webp/decode.h"
+#include "src/dsp/dsp.h"
 #include "src/enc/histogram_enc.h"
 #include "src/utils/utils.h"
@@ -32,10 +33,6 @@ typedef uint32_t (*VP8LPredictorFunc)(const uint32_t* const left,
                                       const uint32_t* const top);
 extern VP8LPredictorFunc VP8LPredictors[16];
-uint32_t VP8LPredictor0_C(const uint32_t* const left,
-                          const uint32_t* const top);
-uint32_t VP8LPredictor1_C(const uint32_t* const left,
-                          const uint32_t* const top);
 uint32_t VP8LPredictor2_C(const uint32_t* const left,
                           const uint32_t* const top);
 uint32_t VP8LPredictor3_C(const uint32_t* const left,
@@ -64,7 +61,7 @@ uint32_t VP8LPredictor13_C(const uint32_t* const left,
 // These Add/Sub function expects upper[-1] and out[-1] to be readable.
 typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
                                         const uint32_t* upper, int num_pixels,
-                                        uint32_t* out);
+                                        uint32_t* WEBP_RESTRICT out);
 extern VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
 extern VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
@@ -95,8 +92,8 @@ void VP8LInverseTransform(const struct VP8LTransform* const transform,
                           const uint32_t* const in, uint32_t* const out);
 // Color space conversion.
-typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
-                                uint8_t* dst);
+typedef void (*VP8LConvertFunc)(const uint32_t* WEBP_RESTRICT src,
+                                int num_pixels, uint8_t* WEBP_RESTRICT dst);
 extern VP8LConvertFunc VP8LConvertBGRAToRGB;
 extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
 extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
@@ -131,13 +128,16 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
                                  const uint32_t* src, int num_pixels,
                                  uint32_t* dst);
-void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
-void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
-void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
-                                 int num_pixels, uint8_t* dst);
-void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
-                               int num_pixels, uint8_t* dst);
-void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGB_C(const uint32_t* WEBP_RESTRICT src, int num_pixels,
+                            uint8_t* WEBP_RESTRICT dst);
+void VP8LConvertBGRAToRGBA_C(const uint32_t* WEBP_RESTRICT src, int num_pixels,
+                             uint8_t* WEBP_RESTRICT dst);
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* WEBP_RESTRICT src,
+                                 int num_pixels, uint8_t* WEBP_RESTRICT dst);
+void VP8LConvertBGRAToRGB565_C(const uint32_t* WEBP_RESTRICT src,
+                               int num_pixels, uint8_t* WEBP_RESTRICT dst);
+void VP8LConvertBGRAToBGR_C(const uint32_t* WEBP_RESTRICT src, int num_pixels,
+                            uint8_t* WEBP_RESTRICT dst);
 void VP8LAddGreenToBlueAndRed_C(const uint32_t* src, int num_pixels,
                                 uint32_t* dst);
@@ -149,32 +149,35 @@ void VP8LDspInit(void);
 typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
 extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
-typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
-                                       uint32_t* dst, int num_pixels);
+typedef void (*VP8LTransformColorFunc)(
+    const VP8LMultipliers* WEBP_RESTRICT const m, uint32_t* WEBP_RESTRICT dst,
+    int num_pixels);
 extern VP8LTransformColorFunc VP8LTransformColor;
 typedef void (*VP8LCollectColorBlueTransformsFunc)(
-    const uint32_t* argb, int stride,
+    const uint32_t* WEBP_RESTRICT argb, int stride,
     int tile_width, int tile_height,
-    int green_to_blue, int red_to_blue, int histo[]);
+    int green_to_blue, int red_to_blue, uint32_t histo[]);
 extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
 typedef void (*VP8LCollectColorRedTransformsFunc)(
-    const uint32_t* argb, int stride,
+    const uint32_t* WEBP_RESTRICT argb, int stride,
     int tile_width, int tile_height,
-    int green_to_red, int histo[]);
+    int green_to_red, uint32_t histo[]);
 extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
 // Expose some C-only fallback functions
-void VP8LTransformColor_C(const VP8LMultipliers* const m,
-                          uint32_t* data, int num_pixels);
+void VP8LTransformColor_C(const VP8LMultipliers* WEBP_RESTRICT const m,
+                          uint32_t* WEBP_RESTRICT data, int num_pixels);
 void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
-void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
+void VP8LCollectColorRedTransforms_C(const uint32_t* WEBP_RESTRICT argb,
+                                     int stride,
                                      int tile_width, int tile_height,
-                                     int green_to_red, int histo[]);
-void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
+                                     int green_to_red, uint32_t histo[]);
+void VP8LCollectColorBlueTransforms_C(const uint32_t* WEBP_RESTRICT argb,
+                                      int stride,
                                       int tile_width, int tile_height,
                                       int green_to_blue, int red_to_blue,
-                                      int histo[]);
+                                      uint32_t histo[]);
 extern VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
 extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
@@ -183,14 +186,17 @@ extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
 // Huffman-cost related functions.
 typedef uint32_t (*VP8LCostFunc)(const uint32_t* population, int length);
-typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
+typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* WEBP_RESTRICT X,
+                                         const uint32_t* WEBP_RESTRICT Y,
                                          int length);
-typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256],
-                                                const int Y[256]);
+typedef uint64_t (*VP8LCombinedShannonEntropyFunc)(const uint32_t X[256],
+                                                   const uint32_t Y[256]);
+typedef uint64_t (*VP8LShannonEntropyFunc)(const uint32_t* X, int length);
 extern VP8LCostFunc VP8LExtraCost;
 extern VP8LCostCombinedFunc VP8LExtraCostCombined;
 extern VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
+extern VP8LShannonEntropyFunc VP8LShannonEntropy;
 typedef struct {        // small struct to hold counters
   int counts[2];        // index: 0=zero streak, 1=non-zero streak
@@ -198,7 +204,7 @@ typedef struct {        // small struct to hold counters
 } VP8LStreaks;
 typedef struct {        // small struct to hold bit entropy results
-  float entropy;        // entropy
+  uint64_t entropy;     // entropy
   uint32_t sum;         // sum of the population
   int nonzeros;         // number of non-zero elements in the population
   uint32_t max_val;     // maximum value in the population
@@ -212,26 +218,30 @@ void VP8LBitEntropyInit(VP8LBitEntropy* const entropy);
 // codec specific heuristics.
 typedef void (*VP8LGetCombinedEntropyUnrefinedFunc)(
     const uint32_t X[], const uint32_t Y[], int length,
-    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats);
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
+    VP8LStreaks* WEBP_RESTRICT const stats);
 extern VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
 // Get the entropy for the distribution 'X'.
-typedef void (*VP8LGetEntropyUnrefinedFunc)(const uint32_t X[], int length,
-                                            VP8LBitEntropy* const bit_entropy,
-                                            VP8LStreaks* const stats);
+typedef void (*VP8LGetEntropyUnrefinedFunc)(
+    const uint32_t X[], int length,
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
+    VP8LStreaks* WEBP_RESTRICT const stats);
 extern VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
-void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
-                              VP8LBitEntropy* const entropy);
+void VP8LBitsEntropyUnrefined(const uint32_t* WEBP_RESTRICT const array, int n,
+                              VP8LBitEntropy* WEBP_RESTRICT const entropy);
-typedef void (*VP8LAddVectorFunc)(const uint32_t* a, const uint32_t* b,
-                                  uint32_t* out, int size);
+typedef void (*VP8LAddVectorFunc)(const uint32_t* WEBP_RESTRICT a,
+                                  const uint32_t* WEBP_RESTRICT b,
+                                  uint32_t* WEBP_RESTRICT out, int size);
 extern VP8LAddVectorFunc VP8LAddVector;
-typedef void (*VP8LAddVectorEqFunc)(const uint32_t* a, uint32_t* out, int size);
+typedef void (*VP8LAddVectorEqFunc)(const uint32_t* WEBP_RESTRICT a,
+                                    uint32_t* WEBP_RESTRICT out, int size);
 extern VP8LAddVectorEqFunc VP8LAddVectorEq;
-void VP8LHistogramAdd(const VP8LHistogram* const a,
-                      const VP8LHistogram* const b,
-                      VP8LHistogram* const out);
+void VP8LHistogramAdd(const VP8LHistogram* WEBP_RESTRICT const a,
+                      const VP8LHistogram* WEBP_RESTRICT const b,
+                      VP8LHistogram* WEBP_RESTRICT const out);
 // -----------------------------------------------------------------------------
 // PrefixEncode()
@@ -241,11 +251,12 @@ typedef int (*VP8LVectorMismatchFunc)(const uint32_t* const array1,
 // Returns the first index where array1 and array2 are different.
 extern VP8LVectorMismatchFunc VP8LVectorMismatch;
-typedef void (*VP8LBundleColorMapFunc)(const uint8_t* const row, int width,
-                                       int xbits, uint32_t* dst);
+typedef void (*VP8LBundleColorMapFunc)(const uint8_t* WEBP_RESTRICT const row,
+                                       int width, int xbits,
+                                       uint32_t* WEBP_RESTRICT dst);
 extern VP8LBundleColorMapFunc VP8LBundleColorMap;
-void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
-                          uint32_t* dst);
+void VP8LBundleColorMap_C(const uint8_t* WEBP_RESTRICT const row,
+                          int width, int xbits, uint32_t* WEBP_RESTRICT dst);
 // Must be called before calling any of the above methods.
 void VP8LEncDspInit(void);
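
The most consequential change in this header is the switch of the entropy hooks from float to uint64_t: the encoder's cost model moved to fixed-point arithmetic (see lossless_common.h below), so entropies are now integers scaled by 1 << LOG_2_PRECISION_BITS. The quantity these function types compute is the usual Shannon form, sketched here against a hypothetical fixed-point slog2() callback that returns v * log2(v) in the same scale (the name and callback shape are for illustration only, not the library's API):

#include <stdint.h>

/* Fixed-point sum * H(X): sum*log2(sum) - sum_i x_i*log2(x_i). */
static uint64_t ShannonEntropySketch(const uint32_t* X, int n,
                                     uint64_t (*slog2)(uint32_t)) {
  uint64_t weighted_logs = 0;
  uint32_t sum = 0;
  int i;
  for (i = 0; i < n; ++i) {
    sum += X[i];
    weighted_logs += slog2(X[i]);
  }
  return slog2(sum) - weighted_logs;
}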


@@ -73,23 +73,44 @@ static WEBP_INLINE int VP8LNearLosslessBits(int near_lossless_quality) {
 // Keeping a high threshold for now.
 #define APPROX_LOG_WITH_CORRECTION_MAX  65536
 #define APPROX_LOG_MAX                   4096
+// VP8LFastLog2 and VP8LFastSLog2 are used on elements from image histograms.
+// The histogram values cannot exceed the maximum number of pixels, which
+// is (1 << 14) * (1 << 14). Therefore S * log(S) < (1 << 33).
+// No more than 32 bits of precision should be chosen.
+// To match the original float implementation, 23 bits of precision are used.
+#define LOG_2_PRECISION_BITS 23
 #define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
+// LOG_2_RECIPROCAL * (1 << LOG_2_PRECISION_BITS)
+#define LOG_2_RECIPROCAL_FIXED_DOUBLE 12102203.161561485379934310913085937500
+#define LOG_2_RECIPROCAL_FIXED ((uint64_t)12102203)
 #define LOG_LOOKUP_IDX_MAX 256
-extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
-extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
-typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
+extern const uint32_t kLog2Table[LOG_LOOKUP_IDX_MAX];
+extern const uint64_t kSLog2Table[LOG_LOOKUP_IDX_MAX];
+typedef uint32_t (*VP8LFastLog2SlowFunc)(uint32_t v);
+typedef uint64_t (*VP8LFastSLog2SlowFunc)(uint32_t v);
 extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
-extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+extern VP8LFastSLog2SlowFunc VP8LFastSLog2Slow;
-static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
+static WEBP_INLINE uint32_t VP8LFastLog2(uint32_t v) {
   return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
 }
 // Fast calculation of v * log2(v) for integer input.
-static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
+static WEBP_INLINE uint64_t VP8LFastSLog2(uint32_t v) {
   return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
 }
+static WEBP_INLINE uint64_t RightShiftRound(uint64_t v, uint32_t shift) {
+  return (v + (1ull << shift >> 1)) >> shift;
+}
+static WEBP_INLINE int64_t DivRound(int64_t a, int64_t b) {
+  return ((a < 0) == (b < 0)) ? ((a + b / 2) / b) : ((a - b / 2) / b);
+}
+#define WEBP_INT64_MAX ((int64_t)((1ull << 63) - 1))
+#define WEBP_UINT64_MAX (~0ull)
 // -----------------------------------------------------------------------------
 // PrefixEncode()
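
With LOG_2_PRECISION_BITS == 23, a fixed-point log2 value is just round(log2(v) * 2^23), and RightShiftRound() converts such values back with round-to-nearest instead of truncation. A quick standalone check of both (C99, math.h):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t RightShiftRound(uint64_t v, uint32_t shift) {
  return (v + (1ull << shift >> 1)) >> shift;   /* same as the helper above */
}

int main(void) {
  /* kLog2Table[5] in the regenerated table below is 19477745, i.e.
   * round(log2(5) * (1 << 23)). */
  const uint32_t fixed = (uint32_t)lround(log2(5.0) * (double)(1 << 23));
  printf("%u\n", fixed);                            /* 19477745 */
  /* Rounding back down to whole bits: log2(5) ~ 2.32 -> 2. */
  printf("%u\n", (unsigned)RightShiftRound(fixed, 23));
  return 0;
}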
@ -173,15 +194,15 @@ uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
// The predictor is added to the output pixel (which // The predictor is added to the output pixel (which
// is therefore considered as a residual) to get the final prediction. // is therefore considered as a residual) to get the final prediction.
#define GENERATE_PREDICTOR_ADD(PREDICTOR, PREDICTOR_ADD) \ #define GENERATE_PREDICTOR_ADD(PREDICTOR, PREDICTOR_ADD) \
static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \ static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \ int num_pixels, uint32_t* WEBP_RESTRICT out) { \
int x; \ int x; \
assert(upper != NULL); \ assert(upper != NULL); \
for (x = 0; x < num_pixels; ++x) { \ for (x = 0; x < num_pixels; ++x) { \
const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x); \ const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x); \
out[x] = VP8LAddPixels(in[x], pred); \ out[x] = VP8LAddPixels(in[x], pred); \
} \ } \
} }
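// [editor's note] e.g. GENERATE_PREDICTOR_ADD(Predictor2, PredictorAdd2_C)
// (hypothetical names) emits a PredictorAdd2_C() that walks the row and
// reconstructs each pixel by adding Predictor2's prediction back onto the
// stored residual.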
#ifdef __cplusplus #ifdef __cplusplus


@ -24,203 +24,123 @@
#include "src/dsp/lossless_common.h" #include "src/dsp/lossless_common.h"
#include "src/dsp/yuv.h" #include "src/dsp/yuv.h"
// lookup table for small values of log2(int) // lookup table for small values of log2(int) * (1 << LOG_2_PRECISION_BITS).
const float kLog2Table[LOG_LOOKUP_IDX_MAX] = { // Obtained in Python with:
0.0000000000000000f, 0.0000000000000000f, // a = [ str(round((1<<23)*math.log2(i))) if i else "0" for i in range(256)]
1.0000000000000000f, 1.5849625007211560f, // print(',\n'.join([' '+','.join(v)
2.0000000000000000f, 2.3219280948873621f, // for v in batched([i.rjust(9) for i in a],7)]))
2.5849625007211560f, 2.8073549220576041f, const uint32_t kLog2Table[LOG_LOOKUP_IDX_MAX] = {
3.0000000000000000f, 3.1699250014423121f, 0, 0, 8388608, 13295629, 16777216, 19477745, 21684237,
3.3219280948873621f, 3.4594316186372973f, 23549800, 25165824, 26591258, 27866353, 29019816, 30072845, 31041538,
3.5849625007211560f, 3.7004397181410921f, 31938408, 32773374, 33554432, 34288123, 34979866, 35634199, 36254961,
3.8073549220576041f, 3.9068905956085187f, 36845429, 37408424, 37946388, 38461453, 38955489, 39430146, 39886887,
4.0000000000000000f, 4.0874628412503390f, 40327016, 40751698, 41161982, 41558811, 41943040, 42315445, 42676731,
4.1699250014423121f, 4.2479275134435852f, 43027545, 43368474, 43700062, 44022807, 44337167, 44643569, 44942404,
4.3219280948873626f, 4.3923174227787606f, 45234037, 45518808, 45797032, 46069003, 46334996, 46595268, 46850061,
4.4594316186372973f, 4.5235619560570130f, 47099600, 47344097, 47583753, 47818754, 48049279, 48275495, 48497560,
4.5849625007211560f, 4.6438561897747243f, 48715624, 48929828, 49140306, 49347187, 49550590, 49750631, 49947419,
4.7004397181410917f, 4.7548875021634682f, 50141058, 50331648, 50519283, 50704053, 50886044, 51065339, 51242017,
4.8073549220576037f, 4.8579809951275718f, 51416153, 51587818, 51757082, 51924012, 52088670, 52251118, 52411415,
4.9068905956085187f, 4.9541963103868749f, 52569616, 52725775, 52879946, 53032177, 53182516, 53331012, 53477707,
5.0000000000000000f, 5.0443941193584533f, 53622645, 53765868, 53907416, 54047327, 54185640, 54322389, 54457611,
5.0874628412503390f, 5.1292830169449663f, 54591338, 54723604, 54854440, 54983876, 55111943, 55238669, 55364082,
5.1699250014423121f, 5.2094533656289501f, 55488208, 55611074, 55732705, 55853126, 55972361, 56090432, 56207362,
5.2479275134435852f, 5.2854022188622487f, 56323174, 56437887, 56551524, 56664103, 56775645, 56886168, 56995691,
5.3219280948873626f, 5.3575520046180837f, 57104232, 57211808, 57318436, 57424133, 57528914, 57632796, 57735795,
5.3923174227787606f, 5.4262647547020979f, 57837923, 57939198, 58039632, 58139239, 58238033, 58336027, 58433234,
5.4594316186372973f, 5.4918530963296747f, 58529666, 58625336, 58720256, 58814437, 58907891, 59000628, 59092661,
5.5235619560570130f, 5.5545888516776376f, 59183999, 59274652, 59364632, 59453947, 59542609, 59630625, 59718006,
5.5849625007211560f, 5.6147098441152083f, 59804761, 59890898, 59976426, 60061354, 60145690, 60229443, 60312620,
5.6438561897747243f, 5.6724253419714951f, 60395229, 60477278, 60558775, 60639726, 60720140, 60800023, 60879382,
5.7004397181410917f, 5.7279204545631987f, 60958224, 61036555, 61114383, 61191714, 61268554, 61344908, 61420785,
5.7548875021634682f, 5.7813597135246599f, 61496188, 61571124, 61645600, 61719620, 61793189, 61866315, 61939001,
5.8073549220576037f, 5.8328900141647412f, 62011253, 62083076, 62154476, 62225457, 62296024, 62366182, 62435935,
5.8579809951275718f, 5.8826430493618415f, 62505289, 62574248, 62642816, 62710997, 62778797, 62846219, 62913267,
5.9068905956085187f, 5.9307373375628866f, 62979946, 63046260, 63112212, 63177807, 63243048, 63307939, 63372484,
5.9541963103868749f, 5.9772799234999167f, 63436687, 63500551, 63564080, 63627277, 63690146, 63752690, 63814912,
6.0000000000000000f, 6.0223678130284543f, 63876816, 63938405, 63999682, 64060650, 64121313, 64181673, 64241734,
6.0443941193584533f, 6.0660891904577720f, 64301498, 64360969, 64420148, 64479040, 64537646, 64595970, 64654014,
6.0874628412503390f, 6.1085244567781691f, 64711782, 64769274, 64826495, 64883447, 64940132, 64996553, 65052711,
6.1292830169449663f, 6.1497471195046822f, 65108611, 65164253, 65219641, 65274776, 65329662, 65384299, 65438691,
6.1699250014423121f, 6.1898245588800175f, 65492840, 65546747, 65600416, 65653847, 65707044, 65760008, 65812741,
6.2094533656289501f, 6.2288186904958804f, 65865245, 65917522, 65969575, 66021404, 66073013, 66124403, 66175575,
6.2479275134435852f, 6.2667865406949010f, 66226531, 66277275, 66327806, 66378127, 66428240, 66478146, 66527847,
6.2854022188622487f, 6.3037807481771030f, 66577345, 66626641, 66675737, 66724635, 66773336, 66821842, 66870154,
6.3219280948873626f, 6.3398500028846243f, 66918274, 66966204, 67013944, 67061497
6.3575520046180837f, 6.3750394313469245f,
6.3923174227787606f, 6.4093909361377017f,
6.4262647547020979f, 6.4429434958487279f,
6.4594316186372973f, 6.4757334309663976f,
6.4918530963296747f, 6.5077946401986963f,
6.5235619560570130f, 6.5391588111080309f,
6.5545888516776376f, 6.5698556083309478f,
6.5849625007211560f, 6.5999128421871278f,
6.6147098441152083f, 6.6293566200796094f,
6.6438561897747243f, 6.6582114827517946f,
6.6724253419714951f, 6.6865005271832185f,
6.7004397181410917f, 6.7142455176661224f,
6.7279204545631987f, 6.7414669864011464f,
6.7548875021634682f, 6.7681843247769259f,
6.7813597135246599f, 6.7944158663501061f,
6.8073549220576037f, 6.8201789624151878f,
6.8328900141647412f, 6.8454900509443747f,
6.8579809951275718f, 6.8703647195834047f,
6.8826430493618415f, 6.8948177633079437f,
6.9068905956085187f, 6.9188632372745946f,
6.9307373375628866f, 6.9425145053392398f,
6.9541963103868749f, 6.9657842846620869f,
6.9772799234999167f, 6.9886846867721654f,
7.0000000000000000f, 7.0112272554232539f,
7.0223678130284543f, 7.0334230015374501f,
7.0443941193584533f, 7.0552824355011898f,
7.0660891904577720f, 7.0768155970508308f,
7.0874628412503390f, 7.0980320829605263f,
7.1085244567781691f, 7.1189410727235076f,
7.1292830169449663f, 7.1395513523987936f,
7.1497471195046822f, 7.1598713367783890f,
7.1699250014423121f, 7.1799090900149344f,
7.1898245588800175f, 7.1996723448363644f,
7.2094533656289501f, 7.2191685204621611f,
7.2288186904958804f, 7.2384047393250785f,
7.2479275134435852f, 7.2573878426926521f,
7.2667865406949010f, 7.2761244052742375f,
7.2854022188622487f, 7.2946207488916270f,
7.3037807481771030f, 7.3128829552843557f,
7.3219280948873626f, 7.3309168781146167f,
7.3398500028846243f, 7.3487281542310771f,
7.3575520046180837f, 7.3663222142458160f,
7.3750394313469245f, 7.3837042924740519f,
7.3923174227787606f, 7.4008794362821843f,
7.4093909361377017f, 7.4178525148858982f,
7.4262647547020979f, 7.4346282276367245f,
7.4429434958487279f, 7.4512111118323289f,
7.4594316186372973f, 7.4676055500829976f,
7.4757334309663976f, 7.4838157772642563f,
7.4918530963296747f, 7.4998458870832056f,
7.5077946401986963f, 7.5156998382840427f,
7.5235619560570130f, 7.5313814605163118f,
7.5391588111080309f, 7.5468944598876364f,
7.5545888516776376f, 7.5622424242210728f,
7.5698556083309478f, 7.5774288280357486f,
7.5849625007211560f, 7.5924570372680806f,
7.5999128421871278f, 7.6073303137496104f,
7.6147098441152083f, 7.6220518194563764f,
7.6293566200796094f, 7.6366246205436487f,
7.6438561897747243f, 7.6510516911789281f,
7.6582114827517946f, 7.6653359171851764f,
7.6724253419714951f, 7.6794800995054464f,
7.6865005271832185f, 7.6934869574993252f,
7.7004397181410917f, 7.7073591320808825f,
7.7142455176661224f, 7.7210991887071855f,
7.7279204545631987f, 7.7347096202258383f,
7.7414669864011464f, 7.7481928495894605f,
7.7548875021634682f, 7.7615512324444795f,
7.7681843247769259f, 7.7747870596011736f,
7.7813597135246599f, 7.7879025593914317f,
7.7944158663501061f, 7.8008998999203047f,
7.8073549220576037f, 7.8137811912170374f,
7.8201789624151878f, 7.8265484872909150f,
7.8328900141647412f, 7.8392037880969436f,
7.8454900509443747f, 7.8517490414160571f,
7.8579809951275718f, 7.8641861446542797f,
7.8703647195834047f, 7.8765169465649993f,
7.8826430493618415f, 7.8887432488982591f,
7.8948177633079437f, 7.9008668079807486f,
7.9068905956085187f, 7.9128893362299619f,
7.9188632372745946f, 7.9248125036057812f,
7.9307373375628866f, 7.9366379390025709f,
7.9425145053392398f, 7.9483672315846778f,
7.9541963103868749f, 7.9600019320680805f,
7.9657842846620869f, 7.9715435539507719f,
7.9772799234999167f, 7.9829935746943103f,
7.9886846867721654f, 7.9943534368588577f
}; };
const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = { // lookup table for small values of int*log2(int) * (1 << LOG_2_PRECISION_BITS).
0.00000000f, 0.00000000f, 2.00000000f, 4.75488750f, // Obtained in Python with:
8.00000000f, 11.60964047f, 15.50977500f, 19.65148445f, // a=[ "%d"%i if i<(1<<32) else "%dull"%i
24.00000000f, 28.52932501f, 33.21928095f, 38.05374781f, // for i in [ round((1<<LOG_2_PRECISION_BITS)*math.log2(i)*i) if i
43.01955001f, 48.10571634f, 53.30296891f, 58.60335893f, // else 0 for i in range(256)]]
64.00000000f, 69.48686830f, 75.05865003f, 80.71062276f, // print(',\n '.join([','.join(v) for v in batched([i.rjust(15)
86.43856190f, 92.23866588f, 98.10749561f, 104.04192499f, // for i in a],4)]))
110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f, const uint64_t kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f, 0, 0, 16777216, 39886887,
160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f, 67108864, 97388723, 130105423, 164848600,
186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f, 201326592, 239321324, 278663526, 319217973,
212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f, 360874141, 403539997, 447137711, 491600606,
240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f, 536870912, 582898099, 629637592, 677049776,
268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f, 725099212, 773754010, 822985323, 872766924,
296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f, 923074875, 973887230, 1025183802, 1076945958,
325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f, 1129156447, 1181799249, 1234859451, 1288323135,
354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f, 1342177280, 1396409681, 1451008871, 1505964059,
384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f, 1561265072, 1616902301, 1672866655, 1729149526,
413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f, 1785742744, 1842638548, 1899829557, 1957308741,
444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f, 2015069397, 2073105127, 2131409817, 2189977618ull,
474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f, 2248802933ull, 2307880396ull, 2367204859ull, 2426771383ull,
505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f, 2486575220ull, 2546611805ull, 2606876748ull, 2667365819ull,
536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f, 2728074942ull, 2789000187ull, 2850137762ull, 2911484006ull,
568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f, 2973035382ull, 3034788471ull, 3096739966ull, 3158886666ull,
600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f, 3221225472ull, 3283753383ull, 3346467489ull, 3409364969ull,
632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f, 3472443085ull, 3535699182ull, 3599130679ull, 3662735070ull,
664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f, 3726509920ull, 3790452862ull, 3854561593ull, 3918833872ull,
696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f, 3983267519ull, 4047860410ull, 4112610476ull, 4177515704ull,
729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f, 4242574127ull, 4307783833ull, 4373142952ull, 4438649662ull,
762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f, 4504302186ull, 4570098787ull, 4636037770ull, 4702117480ull,
795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f, 4768336298ull, 4834692645ull, 4901184974ull, 4967811774ull,
828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f, 5034571569ull, 5101462912ull, 5168484389ull, 5235634615ull,
862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f, 5302912235ull, 5370315922ull, 5437844376ull, 5505496324ull,
896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f, 5573270518ull, 5641165737ull, 5709180782ull, 5777314477ull,
929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f, 5845565671ull, 5913933235ull, 5982416059ull, 6051013057ull,
963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f, 6119723161ull, 6188545324ull, 6257478518ull, 6326521733ull,
998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f, 6395673979ull, 6464934282ull, 6534301685ull, 6603775250ull,
1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f, 6673354052ull, 6743037185ull, 6812823756ull, 6882712890ull,
1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f, 6952703725ull, 7022795412ull, 7092987118ull, 7163278025ull,
1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f, 7233667324ull, 7304154222ull, 7374737939ull, 7445417707ull,
1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f, 7516192768ull, 7587062379ull, 7658025806ull, 7729082328ull,
1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f, 7800231234ull, 7871471825ull, 7942803410ull, 8014225311ull,
1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f, 8085736859ull, 8157337394ull, 8229026267ull, 8300802839ull,
1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f, 8372666477ull, 8444616560ull, 8516652476ull, 8588773618ull,
1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f, 8660979393ull, 8733269211ull, 8805642493ull, 8878098667ull,
1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f, 8950637170ull, 9023257446ull, 9095958945ull, 9168741125ull,
1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f, 9241603454ull, 9314545403ull, 9387566451ull, 9460666086ull,
1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f, 9533843800ull, 9607099093ull, 9680431471ull, 9753840445ull,
1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f, 9827325535ull, 9900886263ull, 9974522161ull, 10048232765ull,
1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f, 10122017615ull, 10195876260ull, 10269808253ull, 10343813150ull,
1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f, 10417890516ull, 10492039919ull, 10566260934ull, 10640553138ull,
1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f, 10714916116ull, 10789349456ull, 10863852751ull, 10938425600ull,
1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f, 11013067604ull, 11087778372ull, 11162557513ull, 11237404645ull,
1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f, 11312319387ull, 11387301364ull, 11462350205ull, 11537465541ull,
1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f, 11612647010ull, 11687894253ull, 11763206912ull, 11838584638ull,
1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f, 11914027082ull, 11989533899ull, 12065104750ull, 12140739296ull,
1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f, 12216437206ull, 12292198148ull, 12368021795ull, 12443907826ull,
1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f, 12519855920ull, 12595865759ull, 12671937032ull, 12748069427ull,
1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f, 12824262637ull, 12900516358ull, 12976830290ull, 13053204134ull,
1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f, 13129637595ull, 13206130381ull, 13282682202ull, 13359292772ull,
1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f, 13435961806ull, 13512689025ull, 13589474149ull, 13666316903ull,
1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f, 13743217014ull, 13820174211ull, 13897188225ull, 13974258793ull,
1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f, 14051385649ull, 14128568535ull, 14205807192ull, 14283101363ull,
1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f, 14360450796ull, 14437855239ull, 14515314443ull, 14592828162ull,
2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f 14670396151ull, 14748018167ull, 14825693972ull, 14903423326ull,
14981205995ull, 15059041743ull, 15136930339ull, 15214871554ull,
15292865160ull, 15370910930ull, 15449008641ull, 15527158071ull,
15605359001ull, 15683611210ull, 15761914485ull, 15840268608ull,
15918673369ull, 15997128556ull, 16075633960ull, 16154189373ull,
16232794589ull, 16311449405ull, 16390153617ull, 16468907026ull,
16547709431ull, 16626560636ull, 16705460444ull, 16784408661ull,
16863405094ull, 16942449552ull, 17021541845ull, 17100681785ull
}; };
const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = { const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
@ -326,23 +246,19 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
}; };
static float FastSLog2Slow_C(uint32_t v) { static uint64_t FastSLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX); assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) { if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
const uint64_t orig_v = v;
uint64_t correction;
#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ) #if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
// use clz if available // use clz if available
const int log_cnt = BitsLog2Floor(v) - 7; const uint64_t log_cnt = BitsLog2Floor(v) - 7;
const uint32_t y = 1 << log_cnt; const uint32_t y = 1 << log_cnt;
int correction = 0;
const float v_f = (float)v;
const uint32_t orig_v = v;
v >>= log_cnt; v >>= log_cnt;
#else #else
int log_cnt = 0; uint64_t log_cnt = 0;
uint32_t y = 1; uint32_t y = 1;
int correction = 0;
const float v_f = (float)v;
const uint32_t orig_v = v;
do { do {
++log_cnt; ++log_cnt;
v = v >> 1; v = v >> 1;
@ -354,45 +270,43 @@ static float FastSLog2Slow_C(uint32_t v) {
// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v) // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
// The correction factor: log(1 + d) ~ d for very small d values, so // The correction factor: log(1 + d) ~ d for very small d values, so
// log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
// LOG_2_RECIPROCAL ~ 23/16 correction = LOG_2_RECIPROCAL_FIXED * (orig_v & (y - 1));
correction = (23 * (orig_v & (y - 1))) >> 4; return orig_v * (kLog2Table[v] + (log_cnt << LOG_2_PRECISION_BITS)) +
return v_f * (kLog2Table[v] + log_cnt) + correction; correction;
} else { } else {
return (float)(LOG_2_RECIPROCAL * v * log((double)v)); return (uint64_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * v * log((double)v) + .5);
} }
} }
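// [editor's sketch] the double-precision quantity that FastSLog2Slow_C()
// approximates, as a hypothetical reference (assuming <math.h>); the fast
// path agrees with this up to the correction term described above:
static uint64_t SLog2Reference(uint32_t v) {
  return (v == 0) ? 0
                  : (uint64_t)((double)v * log2((double)v) *
                               (double)(1 << LOG_2_PRECISION_BITS) + .5);
}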
static float FastLog2Slow_C(uint32_t v) { static uint32_t FastLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX); assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) { if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
const uint32_t orig_v = v;
uint32_t log_2;
#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ) #if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
// use clz if available // use clz if available
const int log_cnt = BitsLog2Floor(v) - 7; const uint32_t log_cnt = BitsLog2Floor(v) - 7;
const uint32_t y = 1 << log_cnt; const uint32_t y = 1 << log_cnt;
const uint32_t orig_v = v;
double log_2;
v >>= log_cnt; v >>= log_cnt;
#else #else
int log_cnt = 0; uint32_t log_cnt = 0;
uint32_t y = 1; uint32_t y = 1;
const uint32_t orig_v = v;
double log_2;
do { do {
++log_cnt; ++log_cnt;
v = v >> 1; v = v >> 1;
y = y << 1; y = y << 1;
} while (v >= LOG_LOOKUP_IDX_MAX); } while (v >= LOG_LOOKUP_IDX_MAX);
#endif #endif
log_2 = kLog2Table[v] + log_cnt; log_2 = kLog2Table[v] + (log_cnt << LOG_2_PRECISION_BITS);
if (orig_v >= APPROX_LOG_MAX) { if (orig_v >= APPROX_LOG_MAX) {
// Since the division is still expensive, add this correction factor only // Since the division is still expensive, add this correction factor only
// for large values of 'v'. // for large values of 'v'.
const int correction = (23 * (orig_v & (y - 1))) >> 4; const uint64_t correction = LOG_2_RECIPROCAL_FIXED * (orig_v & (y - 1));
log_2 += (double)correction / orig_v; log_2 += (uint32_t)DivRound(correction, orig_v);
} }
return (float)log_2; return log_2;
} else { } else {
return (float)(LOG_2_RECIPROCAL * log((double)v)); return (uint32_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * log((double)v) + .5);
} }
} }
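// [editor's sketch] likewise, for v >= LOG_LOOKUP_IDX_MAX FastLog2Slow_C()
// approximates round(log2(v) * (1 << LOG_2_PRECISION_BITS)), i.e. this
// hypothetical reference:
static uint32_t Log2Reference(uint32_t v) {
  return (uint32_t)(log2((double)v) *
                    (double)(1 << LOG_2_PRECISION_BITS) + .5);
}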
@ -400,37 +314,53 @@ static float FastLog2Slow_C(uint32_t v) {
// Methods to calculate Entropy (Shannon). // Methods to calculate Entropy (Shannon).
// Compute the combined Shannon's entropy for distribution {X} and {X+Y} // Compute the combined Shannon's entropy for distribution {X} and {X+Y}
static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) { static uint64_t CombinedShannonEntropy_C(const uint32_t X[256],
const uint32_t Y[256]) {
int i; int i;
float retval = 0.f; uint64_t retval = 0;
int sumX = 0, sumXY = 0; uint32_t sumX = 0, sumXY = 0;
for (i = 0; i < 256; ++i) { for (i = 0; i < 256; ++i) {
const int x = X[i]; const uint32_t x = X[i];
if (x != 0) { if (x != 0) {
const int xy = x + Y[i]; const uint32_t xy = x + Y[i];
sumX += x; sumX += x;
retval -= VP8LFastSLog2(x); retval += VP8LFastSLog2(x);
sumXY += xy; sumXY += xy;
retval -= VP8LFastSLog2(xy); retval += VP8LFastSLog2(xy);
} else if (Y[i] != 0) { } else if (Y[i] != 0) {
sumXY += Y[i]; sumXY += Y[i];
retval -= VP8LFastSLog2(Y[i]); retval += VP8LFastSLog2(Y[i]);
} }
} }
retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY); retval = VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY) - retval;
return retval;
}
static uint64_t ShannonEntropy_C(const uint32_t* X, int n) {
int i;
uint64_t retval = 0;
uint32_t sumX = 0;
for (i = 0; i < n; ++i) {
const int x = X[i];
if (x != 0) {
sumX += x;
retval += VP8LFastSLog2(x);
}
}
retval = VP8LFastSLog2(sumX) - retval;
return retval; return retval;
} }
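// [editor's note] both entropy helpers return a total coding cost scaled by
// 1 << LOG_2_PRECISION_BITS: ShannonEntropy_C() computes
// sum*log2(sum) - sum_i X[i]*log2(X[i]) (the total count times H(X)), and
// CombinedShannonEntropy_C() computes that quantity for {X} plus {X+Y}.
// A hypothetical double-precision reference, assuming <math.h>:
static double ShannonCostInBits(const uint32_t* X, int n) {
  double sum = 0., slog = 0.;
  int i;
  for (i = 0; i < n; ++i) {
    if (X[i] != 0) {
      sum += (double)X[i];
      slog += (double)X[i] * log2((double)X[i]);
    }
  }
  return (sum > 0.) ? sum * log2(sum) - slog : 0.;
}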
void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) { void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
entropy->entropy = 0.; entropy->entropy = 0;
entropy->sum = 0; entropy->sum = 0;
entropy->nonzeros = 0; entropy->nonzeros = 0;
entropy->max_val = 0; entropy->max_val = 0;
entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM; entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM;
} }
void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n, void VP8LBitsEntropyUnrefined(const uint32_t* WEBP_RESTRICT const array, int n,
VP8LBitEntropy* const entropy) { VP8LBitEntropy* WEBP_RESTRICT const entropy) {
int i; int i;
VP8LBitEntropyInit(entropy); VP8LBitEntropyInit(entropy);
@ -440,18 +370,20 @@ void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
entropy->sum += array[i]; entropy->sum += array[i];
entropy->nonzero_code = i; entropy->nonzero_code = i;
++entropy->nonzeros; ++entropy->nonzeros;
entropy->entropy -= VP8LFastSLog2(array[i]); entropy->entropy += VP8LFastSLog2(array[i]);
if (entropy->max_val < array[i]) { if (entropy->max_val < array[i]) {
entropy->max_val = array[i]; entropy->max_val = array[i];
} }
} }
} }
entropy->entropy += VP8LFastSLog2(entropy->sum); entropy->entropy = VP8LFastSLog2(entropy->sum) - entropy->entropy;
} }
static WEBP_INLINE void GetEntropyUnrefinedHelper( static WEBP_INLINE void GetEntropyUnrefinedHelper(
uint32_t val, int i, uint32_t* const val_prev, int* const i_prev, uint32_t val, int i, uint32_t* WEBP_RESTRICT const val_prev,
VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) { int* WEBP_RESTRICT const i_prev,
VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
VP8LStreaks* WEBP_RESTRICT const stats) {
const int streak = i - *i_prev; const int streak = i - *i_prev;
// Gather info for the bit entropy. // Gather info for the bit entropy.
@ -459,7 +391,7 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
bit_entropy->sum += (*val_prev) * streak; bit_entropy->sum += (*val_prev) * streak;
bit_entropy->nonzeros += streak; bit_entropy->nonzeros += streak;
bit_entropy->nonzero_code = *i_prev; bit_entropy->nonzero_code = *i_prev;
bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak; bit_entropy->entropy += VP8LFastSLog2(*val_prev) * streak;
if (bit_entropy->max_val < *val_prev) { if (bit_entropy->max_val < *val_prev) {
bit_entropy->max_val = *val_prev; bit_entropy->max_val = *val_prev;
} }
@ -473,9 +405,10 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
*i_prev = i; *i_prev = i;
} }
static void GetEntropyUnrefined_C(const uint32_t X[], int length, static void GetEntropyUnrefined_C(
VP8LBitEntropy* const bit_entropy, const uint32_t X[], int length,
VP8LStreaks* const stats) { VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
VP8LStreaks* WEBP_RESTRICT const stats) {
int i; int i;
int i_prev = 0; int i_prev = 0;
uint32_t x_prev = X[0]; uint32_t x_prev = X[0];
@ -491,14 +424,13 @@ static void GetEntropyUnrefined_C(const uint32_t X[], int length,
} }
GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats); GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum); bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy;
} }
static void GetCombinedEntropyUnrefined_C(const uint32_t X[], static void GetCombinedEntropyUnrefined_C(
const uint32_t Y[], const uint32_t X[], const uint32_t Y[], int length,
int length, VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
VP8LBitEntropy* const bit_entropy, VP8LStreaks* WEBP_RESTRICT const stats) {
VP8LStreaks* const stats) {
int i = 1; int i = 1;
int i_prev = 0; int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0]; uint32_t xy_prev = X[0] + Y[0];
@ -514,7 +446,7 @@ static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
} }
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats); GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum); bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy;
} }
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
@ -538,8 +470,8 @@ static WEBP_INLINE int8_t U32ToS8(uint32_t v) {
return (int8_t)(v & 0xff); return (int8_t)(v & 0xff);
} }
void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data, void VP8LTransformColor_C(const VP8LMultipliers* WEBP_RESTRICT const m,
int num_pixels) { uint32_t* WEBP_RESTRICT data, int num_pixels) {
int i; int i;
for (i = 0; i < num_pixels; ++i) { for (i = 0; i < num_pixels; ++i) {
const uint32_t argb = data[i]; const uint32_t argb = data[i];
@ -575,9 +507,10 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
return (new_blue & 0xff); return (new_blue & 0xff);
} }
void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride, void VP8LCollectColorRedTransforms_C(const uint32_t* WEBP_RESTRICT argb,
int stride,
int tile_width, int tile_height, int tile_width, int tile_height,
int green_to_red, int histo[]) { int green_to_red, uint32_t histo[]) {
while (tile_height-- > 0) { while (tile_height-- > 0) {
int x; int x;
for (x = 0; x < tile_width; ++x) { for (x = 0; x < tile_width; ++x) {
@ -587,10 +520,11 @@ void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
} }
} }
void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride, void VP8LCollectColorBlueTransforms_C(const uint32_t* WEBP_RESTRICT argb,
int stride,
int tile_width, int tile_height, int tile_width, int tile_height,
int green_to_blue, int red_to_blue, int green_to_blue, int red_to_blue,
int histo[]) { uint32_t histo[]) {
while (tile_height-- > 0) { while (tile_height-- > 0) {
int x; int x;
for (x = 0; x < tile_width; ++x) { for (x = 0; x < tile_width; ++x) {
@ -614,8 +548,8 @@ static int VectorMismatch_C(const uint32_t* const array1,
} }
// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel. // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits, void VP8LBundleColorMap_C(const uint8_t* WEBP_RESTRICT const row,
uint32_t* dst) { int width, int xbits, uint32_t* WEBP_RESTRICT dst) {
int x; int x;
if (xbits > 0) { if (xbits > 0) {
const int bit_depth = 1 << (3 - xbits); const int bit_depth = 1 << (3 - xbits);
@ -646,7 +580,8 @@ static uint32_t ExtraCost_C(const uint32_t* population, int length) {
return cost; return cost;
} }
static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y, static uint32_t ExtraCostCombined_C(const uint32_t* WEBP_RESTRICT X,
const uint32_t* WEBP_RESTRICT Y,
int length) { int length) {
int i; int i;
uint32_t cost = X[4] + Y[4] + X[5] + Y[5]; uint32_t cost = X[4] + Y[4] + X[5] + Y[5];
@ -661,13 +596,15 @@ static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
static void AddVector_C(const uint32_t* a, const uint32_t* b, uint32_t* out, static void AddVector_C(const uint32_t* WEBP_RESTRICT a,
int size) { const uint32_t* WEBP_RESTRICT b,
uint32_t* WEBP_RESTRICT out, int size) {
int i; int i;
for (i = 0; i < size; ++i) out[i] = a[i] + b[i]; for (i = 0; i < size; ++i) out[i] = a[i] + b[i];
} }
static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) { static void AddVectorEq_C(const uint32_t* WEBP_RESTRICT a,
uint32_t* WEBP_RESTRICT out, int size) {
int i; int i;
for (i = 0; i < size; ++i) out[i] += a[i]; for (i = 0; i < size; ++i) out[i] += a[i];
} }
@ -696,8 +633,9 @@ static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) {
} \ } \
} while (0) } while (0)
void VP8LHistogramAdd(const VP8LHistogram* const a, void VP8LHistogramAdd(const VP8LHistogram* WEBP_RESTRICT const a,
const VP8LHistogram* const b, VP8LHistogram* const out) { const VP8LHistogram* WEBP_RESTRICT const b,
VP8LHistogram* WEBP_RESTRICT const out) {
int i; int i;
const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_); const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
assert(a->palette_code_bits_ == b->palette_code_bits_); assert(a->palette_code_bits_ == b->palette_code_bits_);
@ -727,14 +665,14 @@ void VP8LHistogramAdd(const VP8LHistogram* const a,
// Image transforms. // Image transforms.
static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper, static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) { int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i; int i;
for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK); for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK);
(void)upper; (void)upper;
} }
static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper, static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) { int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i; int i;
for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]); for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]);
(void)upper; (void)upper;
@ -745,7 +683,8 @@ static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
#define GENERATE_PREDICTOR_SUB(PREDICTOR_I) \ #define GENERATE_PREDICTOR_SUB(PREDICTOR_I) \
static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in, \ static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in, \
const uint32_t* upper, \ const uint32_t* upper, \
int num_pixels, uint32_t* out) { \ int num_pixels, \
uint32_t* WEBP_RESTRICT out) { \
int x; \ int x; \
assert(upper != NULL); \ assert(upper != NULL); \
for (x = 0; x < num_pixels; ++x) { \ for (x = 0; x < num_pixels; ++x) { \
@ -778,11 +717,12 @@ VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms; VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
VP8LFastLog2SlowFunc VP8LFastLog2Slow; VP8LFastLog2SlowFunc VP8LFastLog2Slow;
VP8LFastLog2SlowFunc VP8LFastSLog2Slow; VP8LFastSLog2SlowFunc VP8LFastSLog2Slow;
VP8LCostFunc VP8LExtraCost; VP8LCostFunc VP8LExtraCost;
VP8LCostCombinedFunc VP8LExtraCostCombined; VP8LCostCombinedFunc VP8LExtraCostCombined;
VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy; VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
VP8LShannonEntropyFunc VP8LShannonEntropy;
VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined; VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined; VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
@ -822,6 +762,7 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
VP8LExtraCost = ExtraCost_C; VP8LExtraCost = ExtraCost_C;
VP8LExtraCostCombined = ExtraCostCombined_C; VP8LExtraCostCombined = ExtraCostCombined_C;
VP8LCombinedShannonEntropy = CombinedShannonEntropy_C; VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
VP8LShannonEntropy = ShannonEntropy_C;
VP8LGetEntropyUnrefined = GetEntropyUnrefined_C; VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C; VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
@ -911,6 +852,7 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
assert(VP8LExtraCost != NULL); assert(VP8LExtraCost != NULL);
assert(VP8LExtraCostCombined != NULL); assert(VP8LExtraCostCombined != NULL);
assert(VP8LCombinedShannonEntropy != NULL); assert(VP8LCombinedShannonEntropy != NULL);
assert(VP8LShannonEntropy != NULL);
assert(VP8LGetEntropyUnrefined != NULL); assert(VP8LGetEntropyUnrefined != NULL);
assert(VP8LGetCombinedEntropyUnrefined != NULL); assert(VP8LGetCombinedEntropyUnrefined != NULL);
assert(VP8LAddVector != NULL); assert(VP8LAddVector != NULL);


@ -23,12 +23,12 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
static float FastSLog2Slow_MIPS32(uint32_t v) { static uint64_t FastSLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX); assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) { if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y, correction; uint32_t log_cnt, y;
uint64_t correction;
const int c24 = 24; const int c24 = 24;
const float v_f = (float)v;
uint32_t temp; uint32_t temp;
// Xf = 256 = 2^8 // Xf = 256 = 2^8
@ -49,22 +49,23 @@ static float FastSLog2Slow_MIPS32(uint32_t v) {
// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v) // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
// The correction factor: log(1 + d) ~ d for very small d values, so // The correction factor: log(1 + d) ~ d for very small d values, so
// log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
// LOG_2_RECIPROCAL ~ 23/16
// (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1) // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
correction = (23 * (v & (y - 1))) >> 4; correction = LOG_2_RECIPROCAL_FIXED * (v & (y - 1));
return v_f * (kLog2Table[temp] + log_cnt) + correction; return (uint64_t)v * (kLog2Table[temp] +
((uint64_t)log_cnt << LOG_2_PRECISION_BITS)) +
correction;
} else { } else {
return (float)(LOG_2_RECIPROCAL * v * log((double)v)); return (uint64_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * v * log((double)v) + .5);
} }
} }
static float FastLog2Slow_MIPS32(uint32_t v) { static uint32_t FastLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX); assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) { if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y; uint32_t log_cnt, y;
const int c24 = 24; const int c24 = 24;
double log_2; uint32_t log_2;
uint32_t temp; uint32_t temp;
__asm__ volatile( __asm__ volatile(
@ -78,17 +79,16 @@ static float FastLog2Slow_MIPS32(uint32_t v) {
: [c24]"r"(c24), [v]"r"(v) : [c24]"r"(c24), [v]"r"(v)
); );
log_2 = kLog2Table[temp] + log_cnt; log_2 = kLog2Table[temp] + (log_cnt << LOG_2_PRECISION_BITS);
if (v >= APPROX_LOG_MAX) { if (v >= APPROX_LOG_MAX) {
// Since the division is still expensive, add this correction factor only // Since the division is still expensive, add this correction factor only
// for large values of 'v'. // for large values of 'v'.
const uint64_t correction = LOG_2_RECIPROCAL_FIXED * (v & (y - 1));
const uint32_t correction = (23 * (v & (y - 1))) >> 4; log_2 += (uint32_t)DivRound(correction, v);
log_2 += (double)correction / v;
} }
return (float)log_2; return log_2;
} else { } else {
return (float)(LOG_2_RECIPROCAL * log((double)v)); return (uint32_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * log((double)v) + .5);
} }
} }
@ -149,8 +149,9 @@ static uint32_t ExtraCost_MIPS32(const uint32_t* const population, int length) {
// pY += 2; // pY += 2;
// } // }
// return cost; // return cost;
static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X, static uint32_t ExtraCostCombined_MIPS32(const uint32_t* WEBP_RESTRICT const X,
const uint32_t* const Y, int length) { const uint32_t* WEBP_RESTRICT const Y,
int length) {
int i, temp0, temp1, temp2, temp3; int i, temp0, temp1, temp2, temp3;
const uint32_t* pX = &X[4]; const uint32_t* pX = &X[4];
const uint32_t* pY = &Y[4]; const uint32_t* pY = &Y[4];
@ -215,8 +216,10 @@ static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X,
// Returns the various RLE counts // Returns the various RLE counts
static WEBP_INLINE void GetEntropyUnrefinedHelper( static WEBP_INLINE void GetEntropyUnrefinedHelper(
uint32_t val, int i, uint32_t* const val_prev, int* const i_prev, uint32_t val, int i, uint32_t* WEBP_RESTRICT const val_prev,
VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) { int* WEBP_RESTRICT const i_prev,
VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
VP8LStreaks* WEBP_RESTRICT const stats) {
int* const pstreaks = &stats->streaks[0][0]; int* const pstreaks = &stats->streaks[0][0];
int* const pcnts = &stats->counts[0]; int* const pcnts = &stats->counts[0];
int temp0, temp1, temp2, temp3; int temp0, temp1, temp2, temp3;
@ -227,7 +230,7 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
bit_entropy->sum += (*val_prev) * streak; bit_entropy->sum += (*val_prev) * streak;
bit_entropy->nonzeros += streak; bit_entropy->nonzeros += streak;
bit_entropy->nonzero_code = *i_prev; bit_entropy->nonzero_code = *i_prev;
bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak; bit_entropy->entropy += VP8LFastSLog2(*val_prev) * streak;
if (bit_entropy->max_val < *val_prev) { if (bit_entropy->max_val < *val_prev) {
bit_entropy->max_val = *val_prev; bit_entropy->max_val = *val_prev;
} }
@ -241,9 +244,10 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
*i_prev = i; *i_prev = i;
} }
static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length, static void GetEntropyUnrefined_MIPS32(
VP8LBitEntropy* const bit_entropy, const uint32_t X[], int length,
VP8LStreaks* const stats) { VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
VP8LStreaks* WEBP_RESTRICT const stats) {
int i; int i;
int i_prev = 0; int i_prev = 0;
uint32_t x_prev = X[0]; uint32_t x_prev = X[0];
@ -259,14 +263,13 @@ static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
} }
GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats); GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum); bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy;
} }
static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[], static void GetCombinedEntropyUnrefined_MIPS32(
const uint32_t Y[], const uint32_t X[], const uint32_t Y[], int length,
int length, VP8LBitEntropy* WEBP_RESTRICT const entropy,
VP8LBitEntropy* const entropy, VP8LStreaks* WEBP_RESTRICT const stats) {
VP8LStreaks* const stats) {
int i = 1; int i = 1;
int i_prev = 0; int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0]; uint32_t xy_prev = X[0] + Y[0];
@ -282,7 +285,7 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
} }
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats); GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
entropy->entropy += VP8LFastSLog2(entropy->sum); entropy->entropy = VP8LFastSLog2(entropy->sum) - entropy->entropy;
} }
#define ASM_START \ #define ASM_START \
@ -344,8 +347,9 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
ASM_END_COMMON_0 \ ASM_END_COMMON_0 \
ASM_END_COMMON_1 ASM_END_COMMON_1
static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb, static void AddVector_MIPS32(const uint32_t* WEBP_RESTRICT pa,
uint32_t* pout, int size) { const uint32_t* WEBP_RESTRICT pb,
uint32_t* WEBP_RESTRICT pout, int size) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const int end = ((size) / 4) * 4; const int end = ((size) / 4) * 4;
const uint32_t* const LoopEnd = pa + end; const uint32_t* const LoopEnd = pa + end;
@ -356,7 +360,8 @@ static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
for (i = 0; i < size - end; ++i) pout[i] = pa[i] + pb[i]; for (i = 0; i < size - end; ++i) pout[i] = pa[i] + pb[i];
} }
static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) { static void AddVectorEq_MIPS32(const uint32_t* WEBP_RESTRICT pa,
uint32_t* WEBP_RESTRICT pout, int size) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const int end = ((size) / 4) * 4; const int end = ((size) / 4) * 4;
const uint32_t* const LoopEnd = pa + end; const uint32_t* const LoopEnd = pa + end;


@ -78,8 +78,9 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
return (uint32_t)((int)(color_pred) * color) >> 5; return (uint32_t)((int)(color_pred) * color) >> 5;
} }
static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m, static void TransformColor_MIPSdspR2(
uint32_t* data, int num_pixels) { const VP8LMultipliers* WEBP_RESTRICT const m, uint32_t* WEBP_RESTRICT data,
int num_pixels) {
int temp0, temp1, temp2, temp3, temp4, temp5; int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red, new_red1; uint32_t argb, argb1, new_red, new_red1;
const uint32_t G_to_R = m->green_to_red_; const uint32_t G_to_R = m->green_to_red_;
@ -171,13 +172,10 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
return (new_blue & 0xff); return (new_blue & 0xff);
} }
static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb, static void CollectColorBlueTransforms_MIPSdspR2(
int stride, const uint32_t* WEBP_RESTRICT argb, int stride,
int tile_width, int tile_width, int tile_height,
int tile_height, int green_to_blue, int red_to_blue, uint32_t histo[]) {
int green_to_blue,
int red_to_blue,
int histo[]) {
const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff); const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff); const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
const uint32_t mask = 0xff00ffu; const uint32_t mask = 0xff00ffu;
@ -225,12 +223,9 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
return (new_red & 0xff); return (new_red & 0xff);
} }
static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb, static void CollectColorRedTransforms_MIPSdspR2(
int stride, const uint32_t* WEBP_RESTRICT argb, int stride,
int tile_width, int tile_width, int tile_height, int green_to_red, uint32_t histo[]) {
int tile_height,
int green_to_red,
int histo[]) {
const int gtr = (green_to_red << 16) | (green_to_red & 0xffff); const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
while (tile_height-- > 0) { while (tile_height-- > 0) {
int x; int x;


@ -48,8 +48,8 @@
dst = VSHF_UB(src, t0, mask1); \ dst = VSHF_UB(src, t0, mask1); \
} while (0) } while (0)
static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data, static void TransformColor_MSA(const VP8LMultipliers* WEBP_RESTRICT const m,
int num_pixels) { uint32_t* WEBP_RESTRICT data, int num_pixels) {
v16u8 src0, dst0; v16u8 src0, dst0;
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ | const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
(m->green_to_red_ << 16)); (m->green_to_red_ << 16));


@ -72,8 +72,9 @@ static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Color Transform // Color Transform
static void TransformColor_NEON(const VP8LMultipliers* const m, static void TransformColor_NEON(const VP8LMultipliers* WEBP_RESTRICT const m,
uint32_t* argb_data, int num_pixels) { uint32_t* WEBP_RESTRICT argb_data,
int num_pixels) {
// sign-extended multiplying constants, pre-shifted by 6. // sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6) #define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = { const int16_t rb[8] = {


@ -49,8 +49,9 @@ static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
#define MK_CST_16(HI, LO) \ #define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff))) _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
static void TransformColor_SSE2(const VP8LMultipliers* const m, static void TransformColor_SSE2(const VP8LMultipliers* WEBP_RESTRICT const m,
uint32_t* argb_data, int num_pixels) { uint32_t* WEBP_RESTRICT argb_data,
int num_pixels) {
const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_), const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
CST_5b(m->green_to_blue_)); CST_5b(m->green_to_blue_));
const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0); const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
@ -79,10 +80,11 @@ static void TransformColor_SSE2(const VP8LMultipliers* const m,
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
#define SPAN 8 #define SPAN 8
static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride, static void CollectColorBlueTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb,
int stride,
int tile_width, int tile_height, int tile_width, int tile_height,
int green_to_blue, int red_to_blue, int green_to_blue, int red_to_blue,
int histo[]) { uint32_t histo[]) {
const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0); const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0);
const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue)); const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
@ -126,9 +128,10 @@ static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
} }
} }
static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride, static void CollectColorRedTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb,
int stride,
int tile_width, int tile_height, int tile_width, int tile_height,
int green_to_red, int histo[]) { int green_to_red, uint32_t histo[]) {
const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red)); const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask = _mm_set1_epi32(0xff); const __m128i mask = _mm_set1_epi32(0xff);
@ -172,75 +175,113 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
// that's ok since the histogram values are less than 1<<28 (max picture size). // that's ok since the histogram values are less than 1<<28 (max picture size).
#define LINE_SIZE 16 // 8 or 16 static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a,
static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out, const uint32_t* WEBP_RESTRICT b,
int size) { uint32_t* WEBP_RESTRICT out, int size) {
int i; int i = 0;
for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) { int aligned_size = size & ~15;
// Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
// NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
// 2). See the usage in VP8LHistogramAdd().
assert(size >= 16);
assert(size % 2 == 0);
do {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
#if (LINE_SIZE == 16)
const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]); const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]); const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
#endif
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]); const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]); const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
#if (LINE_SIZE == 16)
const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]); const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]);
const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]); const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
#endif
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0)); _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1)); _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
_mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2)); _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
_mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif i += 16;
} while (i != aligned_size);
if ((size & 8) != 0) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
i += 8;
} }
for (; i < size; ++i) {
out[i] = a[i] + b[i]; size &= 7;
if (size == 4) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]);
_mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
} else if (size == 2) {
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[i]);
_mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
} }
} }
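// [editor's note] given the sizes stated above (40, or 280 plus 0 or a power
// of two >= 2), the remainder 'size & 7' can only be 0, 2 or 4 once the
// '& 8' step has run, so the two explicit tails cover every case that can
// actually occur.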
static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) { static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a,
int i; uint32_t* WEBP_RESTRICT out, int size) {
for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) { int i = 0;
int aligned_size = size & ~15;
// Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
// NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
// 2). See the usage in VP8LHistogramAdd().
assert(size >= 16);
assert(size % 2 == 0);
do {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
#if (LINE_SIZE == 16)
const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]); const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]); const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
#endif
const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]); const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]); const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
#if (LINE_SIZE == 16)
const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]); const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]);
const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]); const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
#endif
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0)); _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1)); _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
_mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2)); _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
_mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif i += 16;
} while (i != aligned_size);
if ((size & 8) != 0) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
i += 8;
} }
for (; i < size; ++i) {
out[i] += a[i]; size &= 7;
if (size == 4) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i]);
_mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
} else if (size == 2) {
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
const __m128i b0 = _mm_loadl_epi64((const __m128i*)&out[i]);
_mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
} }
} }
#undef LINE_SIZE
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Entropy // Entropy
// TODO(https://crbug.com/webp/499): this function produces different results #if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
// from the C code due to use of double/float resulting in output differences
// when compared to -noasm.
#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86))
static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) { static uint64_t CombinedShannonEntropy_SSE2(const uint32_t X[256],
const uint32_t Y[256]) {
int i; int i;
float retval = 0.f; uint64_t retval = 0;
int sumX = 0, sumXY = 0; uint32_t sumX = 0, sumXY = 0;
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
for (i = 0; i < 256; i += 16) { for (i = 0; i < 256; i += 16) {
@ -260,19 +301,19 @@ static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx; int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
while (my) { while (my) {
const int32_t j = BitsCtz(my); const int32_t j = BitsCtz(my);
int xy; uint32_t xy;
if ((mx >> j) & 1) { if ((mx >> j) & 1) {
const int x = X[i + j]; const int x = X[i + j];
sumXY += x; sumXY += x;
retval -= VP8LFastSLog2(x); retval += VP8LFastSLog2(x);
} }
xy = X[i + j] + Y[i + j]; xy = X[i + j] + Y[i + j];
sumX += xy; sumX += xy;
retval -= VP8LFastSLog2(xy); retval += VP8LFastSLog2(xy);
my &= my - 1; my &= my - 1;
} }
} }
retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY); retval = VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY) - retval;
return retval; return retval;
} }
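// [editor's note] with the entropy math now in integer fixed point, the SSE2
// path matches the C code bit for bit, which is why the old float-divergence
// TODO (crbug.com/webp/499) and the __i386__/_M_IX86 exclusion were dropped.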
@@ -335,8 +376,9 @@ static int VectorMismatch_SSE2(const uint32_t* const array1,
 }
 
 // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
-static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
-                                uint32_t* dst) {
+static void BundleColorMap_SSE2(const uint8_t* WEBP_RESTRICT const row,
+                                int width, int xbits,
+                                uint32_t* WEBP_RESTRICT dst) {
   int x;
   assert(xbits >= 0);
   assert(xbits <= 3);
@@ -425,7 +467,7 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
 // Predictor0: ARGB_BLACK.
 static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -442,7 +484,8 @@ static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
 #define GENERATE_PREDICTOR_1(X, IN) \
   static void PredictorSub##X##_SSE2(const uint32_t* const in, \
                                      const uint32_t* const upper, \
-                                     int num_pixels, uint32_t* const out) { \
+                                     int num_pixels, \
+                                     uint32_t* WEBP_RESTRICT const out) { \
     int i; \
     for (i = 0; i + 4 <= num_pixels; i += 4) { \
       const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
@@ -464,7 +507,7 @@ GENERATE_PREDICTOR_1(4, upper[i - 1])   // Predictor4: TL
 // Predictor5: avg2(avg2(L, TR), T)
 static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
@@ -484,7 +527,8 @@ static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
 #define GENERATE_PREDICTOR_2(X, A, B) \
 static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
-                                   int num_pixels, uint32_t* out) { \
+                                   int num_pixels, \
+                                   uint32_t* WEBP_RESTRICT out) { \
   int i; \
   for (i = 0; i + 4 <= num_pixels; i += 4) { \
     const __m128i tA = _mm_loadu_si128((const __m128i*)&(A)); \
@@ -508,7 +552,7 @@ GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1])   // Predictor9: average(T, TR)
 // Predictor10: avg(avg(L,TL), avg(T, TR)).
 static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
@@ -543,7 +587,7 @@ static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
 }
 
 static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
@@ -569,7 +613,7 @@ static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
 // Predictor12: ClampedSubSubtractFull.
 static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   const __m128i zero = _mm_setzero_si128();
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -598,7 +642,7 @@ static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
 // Predictors13: ClampedAddSubtractHalf
 static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   const __m128i zero = _mm_setzero_si128();
   for (i = 0; i + 2 <= num_pixels; i += 2) {
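The signature churn in this and the following files is one mechanical change: dsp row functions gain WEBP_RESTRICT on their pointer parameters, telling the compiler that input and output rows never alias, so loads can be hoisted and loops vectorized without runtime overlap checks. A sketch of the idea; the macro's exact definition lives in the upstream headers, so the expansion below is an assumption:

/* Assumed shape of the portability macro. */
#ifndef WEBP_RESTRICT
#if defined(__GNUC__)
#define WEBP_RESTRICT __restrict__
#elif defined(_MSC_VER)
#define WEBP_RESTRICT __restrict
#else
#define WEBP_RESTRICT
#endif
#endif

/* With the qualifier, a row operation like this can be auto-vectorized
   without a runtime check that 'a' and 'out' overlap: */
static void AddRows(const uint32_t* WEBP_RESTRICT a,
                    uint32_t* WEBP_RESTRICT out, int size) {
  int i;
  for (i = 0; i < size; ++i) out[i] += a[i];
}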


@@ -44,8 +44,9 @@ static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) {
   return HorizontalSum_SSE41(cost);
 }
 
-static uint32_t ExtraCostCombined_SSE41(const uint32_t* const a,
-                                        const uint32_t* const b, int length) {
+static uint32_t ExtraCostCombined_SSE41(const uint32_t* WEBP_RESTRICT const a,
+                                        const uint32_t* WEBP_RESTRICT const b,
+                                        int length) {
   int i;
   __m128i cost = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]),
                                _mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4]));
@@ -95,10 +96,11 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
 #define MK_CST_16(HI, LO) \
   _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
 
-static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
+static void CollectColorBlueTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb,
+                                             int stride,
                                              int tile_width, int tile_height,
                                              int green_to_blue, int red_to_blue,
-                                             int histo[]) {
+                                             uint32_t histo[]) {
   const __m128i mult =
       MK_CST_16(CST_5b(red_to_blue) + 256,CST_5b(green_to_blue));
   const __m128i perm =
@@ -141,10 +143,11 @@ static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
   }
 }
 
-static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
+static void CollectColorRedTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb,
+                                            int stride,
                                             int tile_width, int tile_height,
-                                            int green_to_red, int histo[]) {
+                                            int green_to_red,
+                                            uint32_t histo[]) {
   const __m128i mult = MK_CST_16(0, CST_5b(green_to_red));
   const __m128i mask_g = _mm_set1_epi32(0x0000ff00);
   if (tile_width >= 4) {


@@ -26,8 +26,8 @@
 #if !defined(WORK_AROUND_GCC)
 // gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
 // gcc-4.8.x at least.
-static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
-                                   int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src,
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const end = src + (num_pixels & ~15);
   for (; src < end; src += 16) {
     uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -41,8 +41,8 @@ static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
   VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
 }
 
-static void ConvertBGRAToBGR_NEON(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src,
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const end = src + (num_pixels & ~15);
   for (; src < end; src += 16) {
     const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -53,8 +53,8 @@ static void ConvertBGRAToBGR_NEON(const uint32_t* src,
   VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst);  // left-overs
 }
 
-static void ConvertBGRAToRGB_NEON(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src,
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const end = src + (num_pixels & ~15);
   for (; src < end; src += 16) {
     const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -71,8 +71,8 @@ static void ConvertBGRAToRGB_NEON(const uint32_t* src,
 static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
 
-static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
-                                   int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src,
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const end = src + (num_pixels & ~1);
   const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
   for (; src < end; src += 2) {
@@ -89,8 +89,8 @@ static const uint8_t kBGRShuffle[3][8] = {
   { 21, 22, 24, 25, 26, 28, 29, 30 }
 };
 
-static void ConvertBGRAToBGR_NEON(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src,
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const end = src + (num_pixels & ~7);
   const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
   const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
@@ -116,8 +116,8 @@ static const uint8_t kRGBShuffle[3][8] = {
   { 21, 20, 26, 25, 24, 30, 29, 28 }
 };
 
-static void ConvertBGRAToRGB_NEON(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src,
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const end = src + (num_pixels & ~7);
   const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
   const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
@@ -209,7 +209,7 @@ static uint32_t Predictor13_NEON(const uint32_t* const left,
 // Predictor0: ARGB_BLACK.
 static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   const uint8x16_t black = vreinterpretq_u8_u32(vdupq_n_u32(ARGB_BLACK));
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -222,7 +222,7 @@ static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
 // Predictor1: left.
 static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   const uint8x16_t zero = LOADQ_U32_AS_U8(0);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -248,7 +248,7 @@ static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
 #define GENERATE_PREDICTOR_1(X, IN) \
   static void PredictorAdd##X##_NEON(const uint32_t* in, \
                                      const uint32_t* upper, int num_pixels, \
-                                     uint32_t* out) { \
+                                     uint32_t* WEBP_RESTRICT out) { \
     int i; \
     for (i = 0; i + 4 <= num_pixels; i += 4) { \
       const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \
@@ -276,7 +276,7 @@ GENERATE_PREDICTOR_1(4, upper[i - 1])
 } while (0)
 
 static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -301,7 +301,7 @@ static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
 // Predictor6: average(left, TL)
 static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -317,7 +317,7 @@ static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
 // Predictor7: average(left, T)
 static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -335,7 +335,7 @@ static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
 #define GENERATE_PREDICTOR_2(X, IN) \
   static void PredictorAdd##X##_NEON(const uint32_t* in, \
                                      const uint32_t* upper, int num_pixels, \
-                                     uint32_t* out) { \
+                                     uint32_t* WEBP_RESTRICT out) { \
     int i; \
     for (i = 0; i + 4 <= num_pixels; i += 4) { \
       const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \
@@ -363,7 +363,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 } while (0)
 
 static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -394,7 +394,7 @@ static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
 } while (0)
 
 static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -427,7 +427,7 @@ static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
 } while (0)
 
 static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1]));
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -468,7 +468,7 @@ static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
 } while (0)
 
 static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {


@@ -186,7 +186,7 @@ static uint32_t Predictor13_SSE2(const uint32_t* const left,
 // Predictor0: ARGB_BLACK.
 static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -202,7 +202,7 @@ static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
 // Predictor1: left.
 static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   __m128i prev = _mm_set1_epi32((int)out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -230,7 +230,8 @@ static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
 // per 8 bit channel.
 #define GENERATE_PREDICTOR_1(X, IN) \
 static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
-                                   int num_pixels, uint32_t* out) { \
+                                   int num_pixels, \
+                                   uint32_t* WEBP_RESTRICT out) { \
   int i; \
   for (i = 0; i + 4 <= num_pixels; i += 4) { \
     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
@@ -259,7 +260,8 @@ GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)
 #define GENERATE_PREDICTOR_2(X, IN) \
 static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
-                                   int num_pixels, uint32_t* out) { \
+                                   int num_pixels, \
+                                   uint32_t* WEBP_RESTRICT out) { \
   int i; \
   for (i = 0; i + 4 <= num_pixels; i += 4) { \
     const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN)); \
@@ -297,7 +299,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 } while (0)
 
 static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   __m128i L = _mm_cvtsi32_si128((int)out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -344,7 +346,7 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
 } while (0)
 
 static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   __m128i pa;
   __m128i L = _mm_cvtsi32_si128((int)out[-1]);
@@ -395,7 +397,7 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
 } while (0)
 
 static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   const __m128i zero = _mm_setzero_si128();
   const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]);
@@ -490,8 +492,8 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
 //------------------------------------------------------------------------------
 // Color-space conversion functions
 
-static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
-                                  uint8_t* dst) {
+static void ConvertBGRAToRGB_SSE2(const uint32_t* WEBP_RESTRICT src,
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
@@ -526,8 +528,8 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
   }
 }
 
-static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
-                                   int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_SSE2(const uint32_t* WEBP_RESTRICT src,
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff);
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
@@ -554,8 +556,9 @@ static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
-                                       int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* WEBP_RESTRICT src,
+                                       int num_pixels,
+                                       uint8_t* WEBP_RESTRICT dst) {
   const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
   const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
   const __m128i* in = (const __m128i*)src;
@@ -590,8 +593,9 @@ static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
-                                     int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB565_SSE2(const uint32_t* WEBP_RESTRICT src,
+                                     int num_pixels,
+                                     uint8_t* WEBP_RESTRICT dst) {
   const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0);
   const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8);
   const __m128i mask_0x07 = _mm_set1_epi8(0x07);
@@ -631,8 +635,8 @@ static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_SSE2(const uint32_t* WEBP_RESTRICT src,
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
   const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
   const __m128i* in = (const __m128i*)src;


@@ -77,8 +77,8 @@ static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
   } \
 } while (0)
 
-static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
-                                   uint8_t* dst) {
+static void ConvertBGRAToRGB_SSE41(const uint32_t* WEBP_RESTRICT src,
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
   const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
@@ -95,8 +95,8 @@ static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
   }
 }
 
-static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
-                                   int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_SSE41(const uint32_t* WEBP_RESTRICT src,
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
   const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,


@@ -26,8 +26,8 @@
 //------------------------------------------------------------------------------
 // Row import
 
-void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
-                                   const uint8_t* src) {
+void WebPRescalerImportRowExpand_C(WebPRescaler* WEBP_RESTRICT const wrk,
+                                   const uint8_t* WEBP_RESTRICT src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   int channel;
@@ -59,8 +59,8 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
   }
 }
 
-void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk,
-                                   const uint8_t* src) {
+void WebPRescalerImportRowShrink_C(WebPRescaler* WEBP_RESTRICT const wrk,
+                                   const uint8_t* WEBP_RESTRICT src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   int channel;
@@ -158,7 +158,8 @@ void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
 //------------------------------------------------------------------------------
 // Main entry calls
 
-void WebPRescalerImportRow(WebPRescaler* const wrk, const uint8_t* src) {
+void WebPRescalerImportRow(WebPRescaler* WEBP_RESTRICT const wrk,
+                           const uint8_t* WEBP_RESTRICT src) {
   assert(!WebPRescalerInputDone(wrk));
   if (!wrk->x_expand) {
     WebPRescalerImportRowShrink(wrk, src);


@@ -21,8 +21,8 @@
 //------------------------------------------------------------------------------
 // Row import
 
-static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
-                                   const uint8_t* src) {
+static void ImportRowShrink_MIPS32(WebPRescaler* WEBP_RESTRICT const wrk,
+                                   const uint8_t* WEBP_RESTRICT src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   const int fx_scale = wrk->fx_scale;
@@ -81,8 +81,8 @@ static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
   }
 }
 
-static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
-                                   const uint8_t* src) {
+static void ImportRowExpand_MIPS32(WebPRescaler* WEBP_RESTRICT const wrk,
+                                   const uint8_t* WEBP_RESTRICT src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   const int x_add = wrk->x_add;


@@ -114,9 +114,9 @@
   dst = __msa_copy_s_w((v4i32)t0, 0); \
 } while (0)
 
-static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
-                                          int length,
-                                          WebPRescaler* const wrk) {
+static WEBP_INLINE void ExportRowExpand_0(
+    const uint32_t* WEBP_RESTRICT frow, uint8_t* WEBP_RESTRICT dst, int length,
+    WebPRescaler* WEBP_RESTRICT const wrk) {
   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
   const v4i32 zero = { 0 };
@@ -171,9 +171,10 @@ static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
   }
 }
 
-static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
-                                          uint8_t* dst, int length,
-                                          WebPRescaler* const wrk) {
+static WEBP_INLINE void ExportRowExpand_1(
+    const uint32_t* WEBP_RESTRICT frow, uint32_t* WEBP_RESTRICT irow,
+    uint8_t* WEBP_RESTRICT dst, int length,
+    WebPRescaler* WEBP_RESTRICT const wrk) {
   const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
   const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
   const v4i32 B1 = __msa_fill_w(B);
@@ -262,10 +263,10 @@ static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
 }
 
 #if 0  // disabled for now. TODO(skal): make match the C-code
-static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
-                                          uint8_t* dst, int length,
-                                          const uint32_t yscale,
-                                          WebPRescaler* const wrk) {
+static WEBP_INLINE void ExportRowShrink_0(
+    const uint32_t* WEBP_RESTRICT frow, uint32_t* WEBP_RESTRICT irow,
+    uint8_t* WEBP_RESTRICT dst, int length, const uint32_t yscale,
+    WebPRescaler* WEBP_RESTRICT const wrk) {
   const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
   const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
   const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
@@ -348,9 +349,9 @@ static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
   }
 }
 
-static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
-                                          int length,
-                                          WebPRescaler* const wrk) {
+static WEBP_INLINE void ExportRowShrink_1(
+    uint32_t* WEBP_RESTRICT irow, uint8_t* WEBP_RESTRICT dst, int length,
+    WebPRescaler* WEBP_RESTRICT const wrk) {
   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
   const v4i32 zero = { 0 };


@@ -45,8 +45,8 @@
 #error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
 #endif
 
-static uint32x4_t Interpolate_NEON(const rescaler_t* const frow,
-                                   const rescaler_t* const irow,
+static uint32x4_t Interpolate_NEON(const rescaler_t* WEBP_RESTRICT const frow,
+                                   const rescaler_t* WEBP_RESTRICT const irow,
                                    uint32_t A, uint32_t B) {
   LOAD_32x4(frow, A0);
   LOAD_32x4(irow, B0);


@@ -43,8 +43,8 @@ static void LoadEightPixels_SSE2(const uint8_t* const src, __m128i* out) {
   *out = _mm_unpacklo_epi8(A, zero);
 }
 
-static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
-                                         const uint8_t* src) {
+static void RescalerImportRowExpand_SSE2(WebPRescaler* WEBP_RESTRICT const wrk,
+                                         const uint8_t* WEBP_RESTRICT src) {
   rescaler_t* frow = wrk->frow;
   const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
   const int x_add = wrk->x_add;
@@ -109,8 +109,8 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
   assert(accum == 0);
 }
 
-static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
-                                         const uint8_t* src) {
+static void RescalerImportRowShrink_SSE2(WebPRescaler* WEBP_RESTRICT const wrk,
+                                         const uint8_t* WEBP_RESTRICT src) {
   const int x_sub = wrk->x_sub;
   int accum = 0;
   const __m128i zero = _mm_setzero_si128();
@@ -168,12 +168,10 @@ static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
 // Row export
 
 // load *src as epi64, multiply by mult and store result in [out0 ... out3]
-static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src,
-                                                 const __m128i* const mult,
-                                                 __m128i* const out0,
-                                                 __m128i* const out1,
-                                                 __m128i* const out2,
-                                                 __m128i* const out3) {
+static WEBP_INLINE void LoadDispatchAndMult_SSE2(
+    const rescaler_t* WEBP_RESTRICT const src, const __m128i* const mult,
+    __m128i* const out0, __m128i* const out1, __m128i* const out2,
+    __m128i* const out3) {
   const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
   const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
   const __m128i A2 = _mm_srli_epi64(A0, 32);


@@ -35,10 +35,14 @@ WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
 #define LOAD_UV(u, v) ((u) | ((v) << 16))
 
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
-                      const uint8_t* top_u, const uint8_t* top_v, \
-                      const uint8_t* cur_u, const uint8_t* cur_v, \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
+                      const uint8_t* WEBP_RESTRICT bottom_y, \
+                      const uint8_t* WEBP_RESTRICT top_u, \
+                      const uint8_t* WEBP_RESTRICT top_v, \
+                      const uint8_t* WEBP_RESTRICT cur_u, \
+                      const uint8_t* WEBP_RESTRICT cur_v, \
+                      uint8_t* WEBP_RESTRICT top_dst, \
+                      uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
   int x; \
   const int last_pixel_pair = (len - 1) >> 1; \
   uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
@@ -136,10 +140,14 @@ static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y,
 #if !defined(FANCY_UPSAMPLING)
 
 #define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC) \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
-                      const uint8_t* top_u, const uint8_t* top_v, \
-                      const uint8_t* bot_u, const uint8_t* bot_v, \
-                      uint8_t* top_dst, uint8_t* bot_dst, int len) { \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
+                      const uint8_t* WEBP_RESTRICT bot_y, \
+                      const uint8_t* WEBP_RESTRICT top_u, \
+                      const uint8_t* WEBP_RESTRICT top_v, \
+                      const uint8_t* WEBP_RESTRICT bot_u, \
+                      const uint8_t* WEBP_RESTRICT bot_v, \
+                      uint8_t* WEBP_RESTRICT top_dst, \
+                      uint8_t* WEBP_RESTRICT bot_dst, int len) { \
   const int half_len = len >> 1; \
   int x; \
   assert(top_dst != NULL); \
@@ -178,10 +186,14 @@ WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
 // YUV444 converter
 
 #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
-extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
-                      uint8_t* dst, int len); \
-void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
-               uint8_t* dst, int len) { \
+extern void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
+                      const uint8_t* WEBP_RESTRICT u, \
+                      const uint8_t* WEBP_RESTRICT v, \
+                      uint8_t* WEBP_RESTRICT dst, int len); \
+void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
+               const uint8_t* WEBP_RESTRICT u, \
+               const uint8_t* WEBP_RESTRICT v, \
+               uint8_t* WEBP_RESTRICT dst, int len) { \
   int i; \
   for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \
 }
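For reference, YUV444_FUNC stamps out one scalar converter per output format. Expanding it by hand for the 4-byte RGBA case gives roughly the following; the instantiation parameters are inferred from the library's usual naming, not quoted from this diff:

/* Hand expansion of YUV444_FUNC(WebPYuv444ToRgba_C, VP8YuvToRgba, 4):
   one per-pixel VP8YuvToRgba call, writing XSTEP = 4 bytes per pixel. */
void WebPYuv444ToRgba_C(const uint8_t* WEBP_RESTRICT y,
                        const uint8_t* WEBP_RESTRICT u,
                        const uint8_t* WEBP_RESTRICT v,
                        uint8_t* WEBP_RESTRICT dst, int len) {
  int i;
  for (i = 0; i < len; ++i) VP8YuvToRgba(y[i], u[i], v[i], &dst[i * 4]);
}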


@@ -143,10 +143,14 @@ static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
 #define LOAD_UV(u, v) ((u) | ((v) << 16))
 
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
-                      const uint8_t* top_u, const uint8_t* top_v, \
-                      const uint8_t* cur_u, const uint8_t* cur_v, \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
+                      const uint8_t* WEBP_RESTRICT bottom_y, \
+                      const uint8_t* WEBP_RESTRICT top_u, \
+                      const uint8_t* WEBP_RESTRICT top_v, \
+                      const uint8_t* WEBP_RESTRICT cur_u, \
+                      const uint8_t* WEBP_RESTRICT cur_v, \
+                      uint8_t* WEBP_RESTRICT top_dst, \
+                      uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
   int x; \
   const int last_pixel_pair = (len - 1) >> 1; \
   uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
@@ -241,8 +245,10 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
 // YUV444 converter
 
 #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
-static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
-                      uint8_t* dst, int len) { \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
+                      const uint8_t* WEBP_RESTRICT u, \
+                      const uint8_t* WEBP_RESTRICT v, \
+                      uint8_t* WEBP_RESTRICT dst, int len) { \
   int i; \
   for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
 }


@@ -320,8 +320,10 @@ static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
 }
 
 #if !defined(WEBP_REDUCE_CSP)
-static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
-                         const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToRgbLine(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B;
   while (length >= 16) {
     CALC_RGB16(y, u, v, R, G, B);
@@ -347,8 +349,10 @@ static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
   }
 }
 
-static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
-                         const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToBgrLine(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B;
   while (length >= 16) {
     CALC_RGB16(y, u, v, R, G, B);
@@ -375,8 +379,10 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
 }
 #endif  // WEBP_REDUCE_CSP
 
-static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
-                          const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToRgbaLine(const uint8_t* WEBP_RESTRICT y,
+                          const uint8_t* WEBP_RESTRICT u,
+                          const uint8_t* WEBP_RESTRICT v,
+                          uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B;
   const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   while (length >= 16) {
@@ -403,8 +409,10 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
   }
 }
 
-static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
-                          const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToBgraLine(const uint8_t* WEBP_RESTRICT y,
+                          const uint8_t* WEBP_RESTRICT u,
+                          const uint8_t* WEBP_RESTRICT v,
+                          uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B;
   const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   while (length >= 16) {
@@ -432,8 +440,10 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
 }
 
 #if !defined(WEBP_REDUCE_CSP)
-static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
-                          const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToArgbLine(const uint8_t* WEBP_RESTRICT y,
+                          const uint8_t* WEBP_RESTRICT u,
+                          const uint8_t* WEBP_RESTRICT v,
+                          uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B;
   const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   while (length >= 16) {
@@ -460,8 +470,10 @@ static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
   }
 }
 
-static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
-                              const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToRgba4444Line(const uint8_t* WEBP_RESTRICT y,
+                              const uint8_t* WEBP_RESTRICT u,
+                              const uint8_t* WEBP_RESTRICT v,
+                              uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B, RG, BA, tmp0, tmp1;
   while (length >= 16) {
 #if (WEBP_SWAP_16BIT_CSP == 1)
@@ -496,8 +508,10 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
   }
 }
 
-static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
-                            const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToRgb565Line(const uint8_t* WEBP_RESTRICT y,
+                            const uint8_t* WEBP_RESTRICT u,
+                            const uint8_t* WEBP_RESTRICT v,
+                            uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B, RG, GB, tmp0, tmp1;
   while (length >= 16) {
 #if (WEBP_SWAP_16BIT_CSP == 1)
@@ -564,11 +578,14 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
 } while (0)
 
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
-                      const uint8_t* top_u, const uint8_t* top_v, \
-                      const uint8_t* cur_u, const uint8_t* cur_v, \
-                      uint8_t* top_dst, uint8_t* bot_dst, int len) \
-{ \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
+                      const uint8_t* WEBP_RESTRICT bot_y, \
+                      const uint8_t* WEBP_RESTRICT top_u, \
+                      const uint8_t* WEBP_RESTRICT top_v, \
+                      const uint8_t* WEBP_RESTRICT cur_u, \
+                      const uint8_t* WEBP_RESTRICT cur_v, \
+                      uint8_t* WEBP_RESTRICT top_dst, \
+                      uint8_t* WEBP_RESTRICT bot_dst, int len) { \
   int size = (len - 1) >> 1; \
   uint8_t temp_u[64]; \
   uint8_t temp_v[64]; \


@@ -58,8 +58,9 @@
 } while (0)
 
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample16Pixels_NEON(const uint8_t* r1, const uint8_t* r2,
-                                  uint8_t* out) {
+static void Upsample16Pixels_NEON(const uint8_t* WEBP_RESTRICT const r1,
+                                  const uint8_t* WEBP_RESTRICT const r2,
+                                  uint8_t* WEBP_RESTRICT const out) {
   UPSAMPLE_16PIXELS(r1, r2, out);
 }
 
@@ -189,57 +190,61 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
   } \
 }
 
 #define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
-                      const uint8_t* top_u, const uint8_t* top_v, \
-                      const uint8_t* cur_u, const uint8_t* cur_v, \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
+                      const uint8_t* WEBP_RESTRICT bottom_y, \
+                      const uint8_t* WEBP_RESTRICT top_u, \
+                      const uint8_t* WEBP_RESTRICT top_v, \
+                      const uint8_t* WEBP_RESTRICT cur_u, \
+                      const uint8_t* WEBP_RESTRICT cur_v, \
+                      uint8_t* WEBP_RESTRICT top_dst, \
+                      uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
   int block; \
   /* 16 byte aligned array to cache reconstructed u and v */ \
   uint8_t uv_buf[2 * 32 + 15]; \
-  uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
+  uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~(uintptr_t)15); \
   const int uv_len = (len + 1) >> 1; \
   /* 9 pixels must be read-able for each block */ \
   const int num_blocks = (uv_len - 1) >> 3; \
   const int leftover = uv_len - num_blocks * 8; \
   const int last_pos = 1 + 16 * num_blocks; \
   \
   const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
   const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
   \
   const int16x4_t coeff1 = vld1_s16(kCoeffs1); \
   const int16x8_t R_Rounder = vdupq_n_s16(-14234); \
   const int16x8_t G_Rounder = vdupq_n_s16(8708); \
   const int16x8_t B_Rounder = vdupq_n_s16(-17685); \
   \
   /* Treat the first pixel in regular way */ \
   assert(top_y != NULL); \
   { \
     const int u0 = (top_u[0] + u_diag) >> 1; \
     const int v0 = (top_v[0] + v_diag) >> 1; \
     VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst); \
   } \
   if (bottom_y != NULL) { \
     const int u0 = (cur_u[0] + u_diag) >> 1; \
     const int v0 = (cur_v[0] + v_diag) >> 1; \
     VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst); \
   } \
   \
   for (block = 0; block < num_blocks; ++block) { \
     UPSAMPLE_16PIXELS(top_u, cur_u, r_uv); \
     UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16); \
     CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv, \
                   top_dst, bottom_dst, 16 * block + 1, 16); \
     top_u += 8; \
     cur_u += 8; \
     top_v += 8; \
     cur_v += 8; \
   } \
   \
   UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv); \
   UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16); \
   CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv, \
                 top_dst, bottom_dst, last_pos, len - last_pos); \
 }
 
 // NEON variants of the fancy upsampler.
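Besides the signature change and reindentation, the macro body above carries one real fix: the scratch buffer is now aligned with ~(uintptr_t)15 instead of a bare ~15 (the SSE4.1 upsampler below gets the same treatment). A minimal sketch of the idiom, with an arbitrary helper name:

#include <stdint.h>

/* Rounds p up to the next 16-byte boundary. Masking with ~(uintptr_t)15
   keeps the arithmetic in the pointer's own width; a bare ~15 is a signed
   int and only clears the right bits through sign-extension when widened. */
static uint8_t* AlignUp16(uint8_t* p) {
  return (uint8_t*)(((uintptr_t)p + 15) & ~(uintptr_t)15);
}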


@@ -88,8 +88,9 @@
 } while (0)
 
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
-                                  uint8_t* const out) {
+static void Upsample32Pixels_SSE2(const uint8_t* WEBP_RESTRICT const r1,
+                                  const uint8_t* WEBP_RESTRICT const r2,
+                                  uint8_t* WEBP_RESTRICT const out) {
   UPSAMPLE_32PIXELS(r1, r2, out);
 }
 
@@ -114,10 +115,14 @@ static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
 } while (0)
 
 #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
-                      const uint8_t* top_u, const uint8_t* top_v, \
-                      const uint8_t* cur_u, const uint8_t* cur_v, \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
+                      const uint8_t* WEBP_RESTRICT bottom_y, \
+                      const uint8_t* WEBP_RESTRICT top_u, \
+                      const uint8_t* WEBP_RESTRICT top_v, \
+                      const uint8_t* WEBP_RESTRICT cur_u, \
+                      const uint8_t* WEBP_RESTRICT cur_v, \
+                      uint8_t* WEBP_RESTRICT top_dst, \
+                      uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
   int uv_pos, pos; \
   /* 16byte-aligned array to cache reconstructed u and v */ \
   uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
@@ -215,10 +220,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 extern void WebPInitYUV444ConvertersSSE2(void);
 
 #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
-extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
-                   uint8_t* dst, int len); \
-static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
-                      uint8_t* dst, int len) { \
+extern void CALL_C(const uint8_t* WEBP_RESTRICT y, \
+                   const uint8_t* WEBP_RESTRICT u, \
+                   const uint8_t* WEBP_RESTRICT v, \
+                   uint8_t* WEBP_RESTRICT dst, int len); \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
+                      const uint8_t* WEBP_RESTRICT u, \
+                      const uint8_t* WEBP_RESTRICT v, \
+                      uint8_t* WEBP_RESTRICT dst, int len) { \
   int i; \
   const int max_len = len & ~31; \
   for (i = 0; i < max_len; i += 32) { \


@@ -90,8 +90,9 @@
 } while (0)
 
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
-                                   uint8_t* const out) {
+static void Upsample32Pixels_SSE41(const uint8_t* WEBP_RESTRICT const r1,
+                                   const uint8_t* WEBP_RESTRICT const r2,
+                                   uint8_t* WEBP_RESTRICT const out) {
   UPSAMPLE_32PIXELS(r1, r2, out);
 }
 
@@ -116,14 +117,18 @@ static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
 } while (0)
 
 #define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
-                      const uint8_t* top_u, const uint8_t* top_v, \
-                      const uint8_t* cur_u, const uint8_t* cur_v, \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
+                      const uint8_t* WEBP_RESTRICT bottom_y, \
+                      const uint8_t* WEBP_RESTRICT top_u, \
+                      const uint8_t* WEBP_RESTRICT top_v, \
+                      const uint8_t* WEBP_RESTRICT cur_u, \
+                      const uint8_t* WEBP_RESTRICT cur_v, \
+                      uint8_t* WEBP_RESTRICT top_dst, \
+                      uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
   int uv_pos, pos; \
   /* 16byte-aligned array to cache reconstructed u and v */ \
   uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
-  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
+  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~(uintptr_t)15); \
   uint8_t* const r_v = r_u + 32; \
   \
   assert(top_y != NULL); \
@@ -202,10 +207,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 extern void WebPInitYUV444ConvertersSSE41(void);
 
 #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
-extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
-                   uint8_t* dst, int len); \
-static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
-                      uint8_t* dst, int len) { \
+extern void CALL_C(const uint8_t* WEBP_RESTRICT y, \
+                   const uint8_t* WEBP_RESTRICT u, \
+                   const uint8_t* WEBP_RESTRICT v, \
+                   uint8_t* WEBP_RESTRICT dst, int len); \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
+                      const uint8_t* WEBP_RESTRICT u, \
+                      const uint8_t* WEBP_RESTRICT v, \
+                      uint8_t* WEBP_RESTRICT dst, int len) { \
  int i; \
  const int max_len = len & ~31; \
  for (i = 0; i < max_len; i += 32) { \


@ -20,9 +20,10 @@
// Plain-C version // Plain-C version
#define ROW_FUNC(FUNC_NAME, FUNC, XSTEP) \ #define ROW_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* y, \ static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* u, const uint8_t* v, \ const uint8_t* WEBP_RESTRICT u, \
uint8_t* dst, int len) { \ const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
const uint8_t* const end = dst + (len & ~1) * (XSTEP); \ const uint8_t* const end = dst + (len & ~1) * (XSTEP); \
while (dst != end) { \ while (dst != end) { \
FUNC(y[0], u[0], v[0], dst); \ FUNC(y[0], u[0], v[0], dst); \
@ -49,9 +50,10 @@ ROW_FUNC(YuvToRgb565Row, VP8YuvToRgb565, 2)
#undef ROW_FUNC #undef ROW_FUNC
// Main call for processing a plane with a WebPSamplerRowFunc function: // Main call for processing a plane with a WebPSamplerRowFunc function:
void WebPSamplerProcessPlane(const uint8_t* y, int y_stride, void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride,
const uint8_t* u, const uint8_t* v, int uv_stride, const uint8_t* WEBP_RESTRICT u,
uint8_t* dst, int dst_stride, const uint8_t* WEBP_RESTRICT v, int uv_stride,
uint8_t* WEBP_RESTRICT dst, int dst_stride,
int width, int height, WebPSamplerRowFunc func) { int width, int height, WebPSamplerRowFunc func) {
int j; int j;
for (j = 0; j < height; ++j) { for (j = 0; j < height; ++j) {
@@ -117,7 +119,8 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
 //-----------------------------------------------------------------------------
 // ARGB -> YUV converters
-static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_C(const uint32_t* WEBP_RESTRICT argb,
+                             uint8_t* WEBP_RESTRICT y, int width) {
   int i;
   for (i = 0; i < width; ++i) {
     const uint32_t p = argb[i];
@@ -126,7 +129,8 @@ static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
   }
 }
-void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
+void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
+                           uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                            int src_width, int do_store) {
   // No rounding. Last pixel is dealt with separately.
   const int uv_width = src_width >> 1;
@@ -169,22 +173,25 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
 //-----------------------------------------------------------------------------
-static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_C(const uint8_t* WEBP_RESTRICT rgb,
+                              uint8_t* WEBP_RESTRICT y, int width) {
   int i;
   for (i = 0; i < width; ++i, rgb += 3) {
     y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
   }
 }
-static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_C(const uint8_t* WEBP_RESTRICT bgr,
+                              uint8_t* WEBP_RESTRICT y, int width) {
   int i;
   for (i = 0; i < width; ++i, bgr += 3) {
     y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
   }
 }
-void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
-                             uint8_t* u, uint8_t* v, int width) {
+void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
+                             uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                             int width) {
   int i;
   for (i = 0; i < width; i += 1, rgb += 4) {
     const int r = rgb[0], g = rgb[1], b = rgb[2];
@@ -195,13 +202,18 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
 //-----------------------------------------------------------------------------
-void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
-void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
-void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
-                              uint8_t* u, uint8_t* v, int width);
+void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb,
+                            uint8_t* WEBP_RESTRICT y, int width);
+void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr,
+                            uint8_t* WEBP_RESTRICT y, int width);
+void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
+                              uint8_t* WEBP_RESTRICT u,
+                              uint8_t* WEBP_RESTRICT v, int width);
-void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
-void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
+void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
+                           uint8_t* WEBP_RESTRICT y, int width);
+void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
+                            uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                             int src_width, int do_store);
 extern void WebPInitConvertARGBToYUVSSE2(void);
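Nearly every hunk in this commit is the same mechanical change: source and destination pointers gain a WEBP_RESTRICT qualifier. A minimal sketch of what that promises the optimizer, with an assumed fallback definition (libwebp defines the macro in its own headers; the exact definition below is a guess, for illustration only):

#if defined(__GNUC__) || defined(_MSC_VER)
#define WEBP_RESTRICT __restrict
#else
#define WEBP_RESTRICT
#endif

/* With the qualifier, the compiler may assume dst[] never overlaps y[],
 * so y[i] can stay in a register across the dst[i] store and the loop
 * becomes straightforward to vectorize. */
static void AddRows(const unsigned char* WEBP_RESTRICT y,
                    unsigned char* WEBP_RESTRICT dst, int len) {
  int i;
  for (i = 0; i < len; ++i) dst[i] = (unsigned char)(dst[i] + y[i]);
}

Calling such a function with overlapping buffers would be undefined behavior, which is why the annotation is only applied where the converters already require distinct planes.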

@@ -11,15 +11,15 @@
 //
 // The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
 // More information at: https://en.wikipedia.org/wiki/YCbCr
-// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
-// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
-// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
+// Y = 0.2568 * R + 0.5041 * G + 0.0979 * B + 16
+// U = -0.1482 * R - 0.2910 * G + 0.4392 * B + 128
+// V = 0.4392 * R - 0.3678 * G - 0.0714 * B + 128
 // We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
 //
 // For the Y'CbCr to RGB conversion, the BT.601 specification reads:
 //   R = 1.164 * (Y-16) + 1.596 * (V-128)
-//   G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.391 * (U-128)
-//   B = 1.164 * (Y-16) + 2.018 * (U-128)
+//   G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.392 * (U-128)
+//   B = 1.164 * (Y-16) + 2.017 * (U-128)
 // where Y is in the [16,235] range, and U/V in the [16,240] range.
 //
 // The fixed-point implementation used here is:
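The coefficient edits above are comment-only: they bring the float view in line with what the 16-bit fixed-point constants actually compute. A sketch of the correspondence, assuming the usual round(w * 2^16) scaling (the constants below are derived from the comment, not copied from the header, so treat them as illustrative):

#include <stdint.h>

enum { kYuvFix = 16, kYuvHalf = 1 << (kYuvFix - 1) };

/* 0.2568 -> 16830, 0.5041 -> 33037, 0.0979 -> 6416 (each round(w << 16)). */
static uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  const int luma = 16830 * r + 33037 * g + 6416 * b;
  return (uint8_t)(((luma + kYuvHalf) >> kYuvFix) + 16);  /* Y in [16,235] */
}

Since the three weights sum to roughly 0.8588, the result stays inside [16,235] without clipping, which is why this path needs no clamp.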
@@ -149,20 +149,34 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
 #if defined(WEBP_USE_SSE2)
 // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
-void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst);
-void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst);
-void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst);
-void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst);
-void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst);
-void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
-                             const uint8_t* v, uint8_t* dst);
-void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                           uint8_t* dst);
+void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                        const uint8_t* WEBP_RESTRICT u,
+                        const uint8_t* WEBP_RESTRICT v,
+                        uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                        const uint8_t* WEBP_RESTRICT u,
+                        const uint8_t* WEBP_RESTRICT v,
+                        uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y,
+                             const uint8_t* WEBP_RESTRICT u,
+                             const uint8_t* WEBP_RESTRICT v,
+                             uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
+                           const uint8_t* WEBP_RESTRICT u,
+                           const uint8_t* WEBP_RESTRICT v,
+                           uint8_t* WEBP_RESTRICT dst);
 #endif  // WEBP_USE_SSE2
@@ -172,10 +186,14 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
 #if defined(WEBP_USE_SSE41)
 // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
-void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst);
-void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst);
+void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst);
 #endif  // WEBP_USE_SSE41

@@ -22,9 +22,10 @@
 // simple point-sampling
 #define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
-static void FUNC_NAME(const uint8_t* y, \
-                      const uint8_t* u, const uint8_t* v, \
-                      uint8_t* dst, int len) { \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
+                      const uint8_t* WEBP_RESTRICT u, \
+                      const uint8_t* WEBP_RESTRICT v, \
+                      uint8_t* WEBP_RESTRICT dst, int len) { \
   int i, r, g, b; \
   int temp0, temp1, temp2, temp3, temp4; \
   for (i = 0; i < (len >> 1); i++) { \

@@ -69,9 +69,10 @@
   : "memory", "hi", "lo" \
 #define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
-static void FUNC_NAME(const uint8_t* y, \
-                      const uint8_t* u, const uint8_t* v, \
-                      uint8_t* dst, int len) { \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
+                      const uint8_t* WEBP_RESTRICT u, \
+                      const uint8_t* WEBP_RESTRICT v, \
+                      uint8_t* WEBP_RESTRICT dst, int len) { \
   int i; \
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
   const int t_con_1 = 26149; \

@@ -46,7 +46,8 @@ static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
   return vqmovn_u16(Y2);
 }
-static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_NEON(const uint8_t* WEBP_RESTRICT rgb,
+                                 uint8_t* WEBP_RESTRICT y, int width) {
   int i;
   for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
     const uint8x8x3_t RGB = vld3_u8(rgb);
@@ -58,7 +59,8 @@ static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
   }
 }
-static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_NEON(const uint8_t* WEBP_RESTRICT bgr,
+                                 uint8_t* WEBP_RESTRICT y, int width) {
   int i;
   for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
     const uint8x8x3_t BGR = vld3_u8(bgr);
@@ -70,7 +72,8 @@ static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
   }
 }
-static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_NEON(const uint32_t* WEBP_RESTRICT argb,
+                                uint8_t* WEBP_RESTRICT y, int width) {
   int i;
   for (i = 0; i + 8 <= width; i += 8) {
     const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
@@ -114,8 +117,9 @@ static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
     MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST); \
   } while (0)
-static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
-                                   uint8_t* u, uint8_t* v, int width) {
+static void ConvertRGBA32ToUV_NEON(const uint16_t* WEBP_RESTRICT rgb,
+                                   uint8_t* WEBP_RESTRICT u,
+                                   uint8_t* WEBP_RESTRICT v, int width) {
   int i;
   for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
     const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
@@ -131,7 +135,9 @@ static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
   }
 }
-static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
+static void ConvertARGBToUV_NEON(const uint32_t* WEBP_RESTRICT argb,
+                                 uint8_t* WEBP_RESTRICT u,
+                                 uint8_t* WEBP_RESTRICT v,
                                  int src_width, int do_store) {
   int i;
   for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {

@@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
 }
 // Convert 32 samples of YUV444 to R/G/B
-static void YUV444ToRGB_SSE2(const uint8_t* const y,
-                             const uint8_t* const u,
-                             const uint8_t* const v,
+static void YUV444ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
+                             const uint8_t* WEBP_RESTRICT const u,
+                             const uint8_t* WEBP_RESTRICT const v,
                              __m128i* const R, __m128i* const G,
                              __m128i* const B) {
   const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
@@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE2(const uint8_t* const y,
 }
 // Convert 32 samples of YUV420 to R/G/B
-static void YUV420ToRGB_SSE2(const uint8_t* const y,
-                             const uint8_t* const u,
-                             const uint8_t* const v,
+static void YUV420ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
+                             const uint8_t* WEBP_RESTRICT const u,
+                             const uint8_t* WEBP_RESTRICT const v,
                              __m128i* const R, __m128i* const G,
                              __m128i* const B) {
   const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
@@ -108,7 +108,7 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
                                            const __m128i* const G,
                                            const __m128i* const B,
                                            const __m128i* const A,
-                                           uint8_t* const dst) {
+                                           uint8_t* WEBP_RESTRICT const dst) {
   const __m128i rb = _mm_packus_epi16(*R, *B);
   const __m128i ga = _mm_packus_epi16(*G, *A);
   const __m128i rg = _mm_unpacklo_epi8(rb, ga);
@@ -120,11 +120,9 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
 }
 // Pack R/G/B/A results into 16b output.
-static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
-                                              const __m128i* const G,
-                                              const __m128i* const B,
-                                              const __m128i* const A,
-                                              uint8_t* const dst) {
+static WEBP_INLINE void PackAndStore4444_SSE2(
+    const __m128i* const R, const __m128i* const G, const __m128i* const B,
+    const __m128i* const A, uint8_t* WEBP_RESTRICT const dst) {
 #if (WEBP_SWAP_16BIT_CSP == 0)
   const __m128i rg0 = _mm_packus_epi16(*R, *G);
   const __m128i ba0 = _mm_packus_epi16(*B, *A);
@@ -145,7 +143,7 @@ static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
 static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
                                              const __m128i* const G,
                                              const __m128i* const B,
-                                             uint8_t* const dst) {
+                                             uint8_t* WEBP_RESTRICT const dst) {
   const __m128i r0 = _mm_packus_epi16(*R, *R);
   const __m128i g0 = _mm_packus_epi16(*G, *G);
   const __m128i b0 = _mm_packus_epi16(*B, *B);
@@ -170,7 +168,7 @@ static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
 static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
                                          __m128i* const in2, __m128i* const in3,
                                          __m128i* const in4, __m128i* const in5,
-                                         uint8_t* const rgb) {
+                                         uint8_t* WEBP_RESTRICT const rgb) {
   // The input is 6 registers of sixteen 8b but for the sake of explanation,
   // let's take 6 registers of four 8b values.
   // To pack, we will keep taking one every two 8b integer and move it
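For intuition, the permutation these unpack sequences implement is just the scalar interleave below; the SSE2 code performs the same reordering on six 16-byte registers at a time (a sketch for illustration, not part of the diff):

#include <stdint.h>

/* rrrr... gggg... bbbb...  ->  rgbrgbrgb... */
static void PlanarTo24bScalar(const uint8_t* r, const uint8_t* g,
                              const uint8_t* b, uint8_t* rgb, int len) {
  int i;
  for (i = 0; i < len; ++i) {
    rgb[3 * i + 0] = r[i];
    rgb[3 * i + 1] = g[i];
    rgb[3 * i + 2] = b[i];
  }
}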
@@ -193,8 +191,10 @@ static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
   _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
 }
-void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst) {
+void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n < 32; n += 8, dst += 32) {
@@ -204,8 +204,10 @@ void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
 }
-void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst) {
+void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n < 32; n += 8, dst += 32) {
@@ -215,8 +217,10 @@ void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
 }
-void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst) {
+void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n < 32; n += 8, dst += 32) {
@@ -226,8 +230,10 @@ void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
 }
-void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
-                             const uint8_t* v, uint8_t* dst) {
+void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y,
+                             const uint8_t* WEBP_RESTRICT u,
+                             const uint8_t* WEBP_RESTRICT v,
+                             uint8_t* WEBP_RESTRICT dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n < 32; n += 8, dst += 16) {
@@ -237,8 +243,10 @@ void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
   }
 }
-void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                           uint8_t* dst) {
+void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
+                           const uint8_t* WEBP_RESTRICT u,
+                           const uint8_t* WEBP_RESTRICT v,
+                           uint8_t* WEBP_RESTRICT dst) {
   int n;
   for (n = 0; n < 32; n += 8, dst += 16) {
     __m128i R, G, B;
@@ -247,8 +255,10 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
 }
-void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst) {
+void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                        const uint8_t* WEBP_RESTRICT u,
+                        const uint8_t* WEBP_RESTRICT v,
+                        uint8_t* WEBP_RESTRICT dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
@@ -269,8 +279,10 @@ void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 }
-void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst) {
+void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                        const uint8_t* WEBP_RESTRICT u,
+                        const uint8_t* WEBP_RESTRICT v,
+                        uint8_t* WEBP_RESTRICT dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
@@ -294,9 +306,10 @@ void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
 //-----------------------------------------------------------------------------
 // Arbitrary-length row conversion functions
-static void YuvToRgbaRow_SSE2(const uint8_t* y,
-                              const uint8_t* u, const uint8_t* v,
-                              uint8_t* dst, int len) {
+static void YuvToRgbaRow_SSE2(const uint8_t* WEBP_RESTRICT y,
+                              const uint8_t* WEBP_RESTRICT u,
+                              const uint8_t* WEBP_RESTRICT v,
+                              uint8_t* WEBP_RESTRICT dst, int len) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@@ -316,9 +329,10 @@ static void YuvToRgbaRow_SSE2(const uint8_t* y,
   }
 }
-static void YuvToBgraRow_SSE2(const uint8_t* y,
-                              const uint8_t* u, const uint8_t* v,
-                              uint8_t* dst, int len) {
+static void YuvToBgraRow_SSE2(const uint8_t* WEBP_RESTRICT y,
+                              const uint8_t* WEBP_RESTRICT u,
+                              const uint8_t* WEBP_RESTRICT v,
+                              uint8_t* WEBP_RESTRICT dst, int len) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@@ -338,9 +352,10 @@ static void YuvToBgraRow_SSE2(const uint8_t* y,
   }
 }
-static void YuvToArgbRow_SSE2(const uint8_t* y,
-                              const uint8_t* u, const uint8_t* v,
-                              uint8_t* dst, int len) {
+static void YuvToArgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
+                              const uint8_t* WEBP_RESTRICT u,
+                              const uint8_t* WEBP_RESTRICT v,
+                              uint8_t* WEBP_RESTRICT dst, int len) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@@ -360,9 +375,10 @@ static void YuvToArgbRow_SSE2(const uint8_t* y,
   }
 }
-static void YuvToRgbRow_SSE2(const uint8_t* y,
-                             const uint8_t* u, const uint8_t* v,
-                             uint8_t* dst, int len) {
+static void YuvToRgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
+                             const uint8_t* WEBP_RESTRICT u,
+                             const uint8_t* WEBP_RESTRICT v,
+                             uint8_t* WEBP_RESTRICT dst, int len) {
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@@ -397,9 +413,10 @@ static void YuvToRgbRow_SSE2(const uint8_t* y,
   }
 }
-static void YuvToBgrRow_SSE2(const uint8_t* y,
-                             const uint8_t* u, const uint8_t* v,
-                             uint8_t* dst, int len) {
+static void YuvToBgrRow_SSE2(const uint8_t* WEBP_RESTRICT y,
+                             const uint8_t* WEBP_RESTRICT u,
+                             const uint8_t* WEBP_RESTRICT v,
+                             uint8_t* WEBP_RESTRICT dst, int len) {
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@@ -471,7 +488,7 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // Similar to PlanarTo24bHelper(), but in reverse order.
 static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
-    const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
+    const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
   __m128i tmp[6];
   tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
   tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
@@ -488,8 +505,8 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
 }
 // Convert 8 packed ARGB to r[], g[], b[]
-static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
-                                                 __m128i* const rgb /*in[6]*/) {
+static WEBP_INLINE void RGB32PackedToPlanar_SSE2(
+    const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
   const __m128i zero = _mm_setzero_si128();
   __m128i a0 = LOAD_16(argb + 0);
   __m128i a1 = LOAD_16(argb + 4);
@@ -562,7 +579,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
 #undef MK_CST_16
 #undef TRANSFORM
-static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_SSE2(const uint8_t* WEBP_RESTRICT rgb,
+                                 uint8_t* WEBP_RESTRICT y, int width) {
   const int max_width = width & ~31;
   int i;
   for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
@@ -596,7 +614,8 @@ static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
   }
 }
-static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_SSE2(const uint8_t* WEBP_RESTRICT bgr,
+                                 uint8_t* WEBP_RESTRICT y, int width) {
   const int max_width = width & ~31;
   int i;
   for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
@@ -630,7 +649,8 @@ static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
   }
 }
-static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_SSE2(const uint32_t* WEBP_RESTRICT argb,
+                                uint8_t* WEBP_RESTRICT y, int width) {
   const int max_width = width & ~15;
   int i;
   for (i = 0; i < max_width; i += 16) {
@@ -658,8 +678,9 @@ static void HorizontalAddPack_SSE2(const __m128i* const A,
   *out = _mm_packs_epi32(C, D);
 }
-static void ConvertARGBToUV_SSE2(const uint32_t* argb,
-                                 uint8_t* u, uint8_t* v,
+static void ConvertARGBToUV_SSE2(const uint32_t* WEBP_RESTRICT argb,
+                                 uint8_t* WEBP_RESTRICT u,
+                                 uint8_t* WEBP_RESTRICT v,
                                  int src_width, int do_store) {
   const int max_width = src_width & ~31;
   int i;
@@ -695,7 +716,7 @@ static void ConvertARGBToUV_SSE2(const uint32_t* argb,
 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
 static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
-    const uint16_t* const rgbx,
+    const uint16_t* WEBP_RESTRICT const rgbx,
     __m128i* const r, __m128i* const g, __m128i* const b) {
   const __m128i in0 = LOAD_16(rgbx + 0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
   const __m128i in1 = LOAD_16(rgbx + 8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
@@ -715,8 +736,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
   *b = _mm_unpacklo_epi64(B1, B3);
 }
-static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
-                                   uint8_t* u, uint8_t* v, int width) {
+static void ConvertRGBA32ToUV_SSE2(const uint16_t* WEBP_RESTRICT rgb,
+                                   uint8_t* WEBP_RESTRICT u,
+                                   uint8_t* WEBP_RESTRICT v, int width) {
   const int max_width = width & ~15;
   const uint16_t* const last_rgb = rgb + 4 * max_width;
   while (rgb < last_rgb) {

@@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
 }
 // Convert 32 samples of YUV444 to R/G/B
-static void YUV444ToRGB_SSE41(const uint8_t* const y,
-                              const uint8_t* const u,
-                              const uint8_t* const v,
+static void YUV444ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
+                              const uint8_t* WEBP_RESTRICT const u,
+                              const uint8_t* WEBP_RESTRICT const v,
                               __m128i* const R, __m128i* const G,
                               __m128i* const B) {
   const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
@@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE41(const uint8_t* const y,
 }
 // Convert 32 samples of YUV420 to R/G/B
-static void YUV420ToRGB_SSE41(const uint8_t* const y,
-                              const uint8_t* const u,
-                              const uint8_t* const v,
+static void YUV420ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
+                              const uint8_t* WEBP_RESTRICT const u,
+                              const uint8_t* WEBP_RESTRICT const v,
                               __m128i* const R, __m128i* const G,
                               __m128i* const B) {
   const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
@@ -109,7 +109,7 @@ static void YUV420ToRGB_SSE41(const uint8_t* const y,
 static WEBP_INLINE void PlanarTo24b_SSE41(
     __m128i* const in0, __m128i* const in1, __m128i* const in2,
     __m128i* const in3, __m128i* const in4, __m128i* const in5,
-    uint8_t* const rgb) {
+    uint8_t* WEBP_RESTRICT const rgb) {
   // The input is 6 registers of sixteen 8b but for the sake of explanation,
   // let's take 6 registers of four 8b values.
   // To pack, we will keep taking one every two 8b integer and move it
@@ -132,8 +132,10 @@ static WEBP_INLINE void PlanarTo24b_SSE41(
   _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
 }
-void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst) {
+void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
@@ -154,8 +156,10 @@ void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 }
-void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst) {
+void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
@@ -179,9 +183,10 @@ void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
 //-----------------------------------------------------------------------------
 // Arbitrary-length row conversion functions
-static void YuvToRgbRow_SSE41(const uint8_t* y,
-                              const uint8_t* u, const uint8_t* v,
-                              uint8_t* dst, int len) {
+static void YuvToRgbRow_SSE41(const uint8_t* WEBP_RESTRICT y,
+                              const uint8_t* WEBP_RESTRICT u,
+                              const uint8_t* WEBP_RESTRICT v,
+                              uint8_t* WEBP_RESTRICT dst, int len) {
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@@ -216,9 +221,10 @@ static void YuvToRgbRow_SSE41(const uint8_t* y,
   }
 }
-static void YuvToBgrRow_SSE41(const uint8_t* y,
-                              const uint8_t* u, const uint8_t* v,
-                              uint8_t* dst, int len) {
+static void YuvToBgrRow_SSE41(const uint8_t* WEBP_RESTRICT y,
+                              const uint8_t* WEBP_RESTRICT u,
+                              const uint8_t* WEBP_RESTRICT v,
+                              uint8_t* WEBP_RESTRICT dst, int len) {
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@@ -290,7 +296,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // Similar to PlanarTo24bHelper(), but in reverse order.
 static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
-    const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
+    const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
   const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0));
   const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
   const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
@@ -334,7 +340,7 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
 // Convert 8 packed ARGB to r[], g[], b[]
 static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
-    const uint32_t* const argb, __m128i* const rgb /*in[6]*/) {
+    const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
   const __m128i zero = _mm_setzero_si128();
   __m128i a0 = LOAD_16(argb + 0);
   __m128i a1 = LOAD_16(argb + 4);
@@ -407,7 +413,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
 #undef MK_CST_16
 #undef TRANSFORM
-static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_SSE41(const uint8_t* WEBP_RESTRICT rgb,
+                                  uint8_t* WEBP_RESTRICT y, int width) {
   const int max_width = width & ~31;
   int i;
   for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
@@ -441,7 +448,8 @@ static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
   }
 }
-static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_SSE41(const uint8_t* WEBP_RESTRICT bgr,
+                                  uint8_t* WEBP_RESTRICT y, int width) {
   const int max_width = width & ~31;
   int i;
   for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
@@ -475,7 +483,8 @@ static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
   }
 }
-static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_SSE41(const uint32_t* WEBP_RESTRICT argb,
+                                 uint8_t* WEBP_RESTRICT y, int width) {
   const int max_width = width & ~15;
   int i;
   for (i = 0; i < max_width; i += 16) {
@@ -503,8 +512,9 @@ static void HorizontalAddPack_SSE41(const __m128i* const A,
   *out = _mm_packs_epi32(C, D);
 }
-static void ConvertARGBToUV_SSE41(const uint32_t* argb,
-                                  uint8_t* u, uint8_t* v,
+static void ConvertARGBToUV_SSE41(const uint32_t* WEBP_RESTRICT argb,
+                                  uint8_t* WEBP_RESTRICT u,
+                                  uint8_t* WEBP_RESTRICT v,
                                   int src_width, int do_store) {
   const int max_width = src_width & ~31;
   int i;
@@ -540,7 +550,7 @@ static void ConvertARGBToUV_SSE41(const uint32_t* argb,
 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
 static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
-    const uint16_t* const rgbx,
+    const uint16_t* WEBP_RESTRICT const rgbx,
    __m128i* const r, __m128i* const g, __m128i* const b) {
   const __m128i in0 = LOAD_16(rgbx + 0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
   const __m128i in1 = LOAD_16(rgbx + 8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
@@ -570,8 +580,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
   *b = _mm_unpackhi_epi64(B1, B3);
 }
-static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
-                                    uint8_t* u, uint8_t* v, int width) {
+static void ConvertRGBA32ToUV_SSE41(const uint16_t* WEBP_RESTRICT rgb,
+                                    uint8_t* WEBP_RESTRICT u,
+                                    uint8_t* WEBP_RESTRICT v, int width) {
   const int max_width = width & ~15;
   const uint16_t* const last_rgb = rgb + 4 * max_width;
   while (rgb < last_rgb) {

@@ -276,6 +276,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
     stats->lossless_features = best.stats.lossless_features;
     stats->histogram_bits = best.stats.histogram_bits;
     stats->transform_bits = best.stats.transform_bits;
+    stats->cross_color_transform_bits = best.stats.cross_color_transform_bits;
     stats->cache_bits = best.stats.cache_bits;
     stats->palette_size = best.stats.palette_size;
     stats->lossless_size = best.stats.lossless_size;

@@ -15,7 +15,7 @@
 //
 #include <assert.h>
-#include <float.h>
+#include <string.h>
 #include "src/dsp/lossless_common.h"
 #include "src/enc/backward_references_enc.h"
@@ -31,15 +31,15 @@ extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
                                       const PixOrCopy v);
 typedef struct {
-  float alpha_[VALUES_IN_BYTE];
-  float red_[VALUES_IN_BYTE];
-  float blue_[VALUES_IN_BYTE];
-  float distance_[NUM_DISTANCE_CODES];
-  float* literal_;
+  uint32_t alpha_[VALUES_IN_BYTE];
+  uint32_t red_[VALUES_IN_BYTE];
+  uint32_t blue_[VALUES_IN_BYTE];
+  uint32_t distance_[NUM_DISTANCE_CODES];
+  uint32_t* literal_;
 } CostModel;
 static void ConvertPopulationCountTableToBitEstimates(
-    int num_symbols, const uint32_t population_counts[], float output[]) {
+    int num_symbols, const uint32_t population_counts[], uint32_t output[]) {
   uint32_t sum = 0;
   int nonzeros = 0;
   int i;
@@ -52,7 +52,7 @@ static void ConvertPopulationCountTableToBitEstimates(
   if (nonzeros <= 1) {
     memset(output, 0, num_symbols * sizeof(*output));
   } else {
-    const float logsum = VP8LFastLog2(sum);
+    const uint32_t logsum = VP8LFastLog2(sum);
     for (i = 0; i < num_symbols; ++i) {
       output[i] = logsum - VP8LFastLog2(population_counts[i]);
     }
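The estimate being stored is the usual Shannon cost: a symbol with count c out of a total s needs about -log2(c/s) = log2(s) - log2(c) bits. With this release the quantity is kept as an unsigned fixed-point integer (VP8LFastLog2() returning a scaled value) instead of a float; in plain floating point the same math reads (sketch, for illustration only):

#include <math.h>
#include <stdint.h>

/* Bits needed per occurrence of a symbol seen 'count' times out of 'sum'. */
static double BitEstimate(uint32_t count, uint32_t sum) {
  if (count == 0 || sum == 0) return 0.0;
  return log2((double)sum) - log2((double)count);
}

The fixed-point scale factor is internal to the library and is not visible in this hunk.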
@@ -93,47 +93,47 @@ static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
   return ok;
 }
-static WEBP_INLINE float GetLiteralCost(const CostModel* const m, uint32_t v) {
-  return m->alpha_[v >> 24] +
-         m->red_[(v >> 16) & 0xff] +
-         m->literal_[(v >> 8) & 0xff] +
-         m->blue_[v & 0xff];
+static WEBP_INLINE int64_t GetLiteralCost(const CostModel* const m,
+                                          uint32_t v) {
+  return (int64_t)m->alpha_[v >> 24] + m->red_[(v >> 16) & 0xff] +
+         m->literal_[(v >> 8) & 0xff] + m->blue_[v & 0xff];
 }
-static WEBP_INLINE float GetCacheCost(const CostModel* const m, uint32_t idx) {
+static WEBP_INLINE int64_t GetCacheCost(const CostModel* const m,
+                                        uint32_t idx) {
   const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
-  return m->literal_[literal_idx];
+  return (int64_t)m->literal_[literal_idx];
 }
-static WEBP_INLINE float GetLengthCost(const CostModel* const m,
-                                       uint32_t length) {
+static WEBP_INLINE int64_t GetLengthCost(const CostModel* const m,
+                                         uint32_t length) {
   int code, extra_bits;
   VP8LPrefixEncodeBits(length, &code, &extra_bits);
-  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
+  return (int64_t)m->literal_[VALUES_IN_BYTE + code] +
+         ((int64_t)extra_bits << LOG_2_PRECISION_BITS);
 }
-static WEBP_INLINE float GetDistanceCost(const CostModel* const m,
-                                         uint32_t distance) {
+static WEBP_INLINE int64_t GetDistanceCost(const CostModel* const m,
+                                           uint32_t distance) {
   int code, extra_bits;
   VP8LPrefixEncodeBits(distance, &code, &extra_bits);
-  return m->distance_[code] + extra_bits;
+  return (int64_t)m->distance_[code] +
+         ((int64_t)extra_bits << LOG_2_PRECISION_BITS);
 }
 static WEBP_INLINE void AddSingleLiteralWithCostModel(
     const uint32_t* const argb, VP8LColorCache* const hashers,
     const CostModel* const cost_model, int idx, int use_color_cache,
-    float prev_cost, float* const cost, uint16_t* const dist_array) {
-  float cost_val = prev_cost;
+    int64_t prev_cost, int64_t* const cost, uint16_t* const dist_array) {
+  int64_t cost_val = prev_cost;
   const uint32_t color = argb[idx];
   const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
   if (ix >= 0) {
     // use_color_cache is true and hashers contains color
-    const float mul0 = 0.68f;
-    cost_val += GetCacheCost(cost_model, ix) * mul0;
+    cost_val += DivRound(GetCacheCost(cost_model, ix) * 68, 100);
   } else {
-    const float mul1 = 0.82f;
     if (use_color_cache) VP8LColorCacheInsert(hashers, color);
-    cost_val += GetLiteralCost(cost_model, color) * mul1;
+    cost_val += DivRound(GetLiteralCost(cost_model, color) * 82, 100);
   }
   if (cost[idx] > cost_val) {
     cost[idx] = cost_val;
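The former float multipliers (0.68 and 0.82) survive as integer rational factors: cost * 68 / 100 with rounding. DivRound() itself is defined elsewhere in the library; a matching rounded division for the non-negative costs used here would be (sketch, illustrative name):

#include <stdint.h>

static int64_t DivRoundSketch(int64_t num, int64_t den) {
  return (num + den / 2) / den;  /* round-half-up for num, den >= 0 */
}
/* e.g. a cache hit costing 1000 (in scaled units) contributes
 * DivRoundSketch(1000 * 68, 100) == 680, mirroring 0.68f * 1000. */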
@@ -163,7 +163,7 @@ static WEBP_INLINE void AddSingleLiteralWithCostModel(
 // therefore no overlapping intervals.
 typedef struct CostInterval CostInterval;
 struct CostInterval {
-  float cost_;
+  int64_t cost_;
   int start_;
   int end_;
   int index_;
@@ -173,7 +173,7 @@ struct CostInterval {
 // The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
 typedef struct {
-  float cost_;
+  int64_t cost_;
   int start_;
   int end_;  // Exclusive.
 } CostCacheInterval;
@@ -188,8 +188,9 @@ typedef struct {
   int count_;  // The number of stored intervals.
   CostCacheInterval* cache_intervals_;
   size_t cache_intervals_size_;
-  float cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
-  float* costs_;
+  // Contains the GetLengthCost(cost_model, k).
+  int64_t cost_cache_[MAX_LENGTH];
+  int64_t* costs_;
   uint16_t* dist_array_;
   // Most of the time, we only need few intervals -> use a free-list, to avoid
   // fragmentation with small allocs in most common cases.
@@ -298,7 +299,7 @@ static int CostManagerInit(CostManager* const manager,
   cur->end_ = 1;
   cur->cost_ = manager->cost_cache_[0];
   for (i = 1; i < cost_cache_size; ++i) {
-    const float cost_val = manager->cost_cache_[i];
+    const int64_t cost_val = manager->cost_cache_[i];
     if (cost_val != cur->cost_) {
       ++cur;
       // Initialize an interval.
@@ -311,13 +312,15 @@ static int CostManagerInit(CostManager* const manager,
          manager->cache_intervals_size_);
   }
-  manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
+  manager->costs_ =
+      (int64_t*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
   if (manager->costs_ == NULL) {
     CostManagerClear(manager);
     return 0;
   }
-  // Set the initial costs_ high for every pixel as we will keep the minimum.
-  for (i = 0; i < pix_count; ++i) manager->costs_[i] = FLT_MAX;
+  // Set the initial costs_ to INT64_MAX for every pixel as we will keep the
+  // minimum.
+  for (i = 0; i < pix_count; ++i) manager->costs_[i] = WEBP_INT64_MAX;
   return 1;
 }
@@ -325,7 +328,7 @@ static int CostManagerInit(CostManager* const manager,
 // Given the cost and the position that define an interval, update the cost at
 // pixel 'i' if it is smaller than the previously computed value.
 static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
-                                   int position, float cost) {
+                                   int position, int64_t cost) {
   const int k = i - position;
   assert(k >= 0 && k < MAX_LENGTH);
@@ -339,7 +342,7 @@ static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
 // all the pixels between 'start' and 'end' excluded.
 static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
                                               int start, int end, int position,
-                                              float cost) {
+                                              int64_t cost) {
   int i;
   for (i = start; i < end; ++i) UpdateCost(manager, i, position, cost);
 }
@@ -424,7 +427,7 @@ static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
 // interval_in as a hint. The intervals are sorted by start_ value.
 static WEBP_INLINE void InsertInterval(CostManager* const manager,
                                        CostInterval* const interval_in,
-                                       float cost, int position, int start,
+                                       int64_t cost, int position, int start,
                                        int end) {
   CostInterval* interval_new;
@@ -463,7 +466,7 @@ static WEBP_INLINE void InsertInterval(CostManager* const manager,
 // If handling the interval or one of its subintervals becomes to heavy, its
 // contribution is added to the costs right away.
 static WEBP_INLINE void PushInterval(CostManager* const manager,
-                                     float distance_cost, int position,
+                                     int64_t distance_cost, int position,
                                      int len) {
   size_t i;
   CostInterval* interval = manager->head_;
@@ -478,7 +481,7 @@ static WEBP_INLINE void PushInterval(CostManager* const manager,
     int j;
     for (j = position; j < position + len; ++j) {
      const int k = j - position;
-      float cost_tmp;
+      int64_t cost_tmp;
       assert(k >= 0 && k < MAX_LENGTH);
       cost_tmp = distance_cost + manager->cost_cache_[k];
@@ -498,7 +501,7 @@ static WEBP_INLINE void PushInterval(CostManager* const manager,
     const int end = position + (cost_cache_intervals[i].end_ > len
                                     ? len
                                     : cost_cache_intervals[i].end_);
-    const float cost = distance_cost + cost_cache_intervals[i].cost_;
+    const int64_t cost = distance_cost + cost_cache_intervals[i].cost_;
     for (; interval != NULL && interval->start_ < end;
          interval = interval_next) {
@@ -576,7 +579,7 @@ static int BackwardReferencesHashChainDistanceOnly(
   const int pix_count = xsize * ysize;
   const int use_color_cache = (cache_bits > 0);
   const size_t literal_array_size =
-      sizeof(float) * (VP8LHistogramNumCodes(cache_bits));
+      sizeof(*((CostModel*)NULL)->literal_) * VP8LHistogramNumCodes(cache_bits);
   const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
   CostModel* const cost_model =
       (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
@@ -584,13 +587,13 @@ static int BackwardReferencesHashChainDistanceOnly(
   CostManager* cost_manager =
       (CostManager*)WebPSafeCalloc(1ULL, sizeof(*cost_manager));
   int offset_prev = -1, len_prev = -1;
-  float offset_cost = -1.f;
+  int64_t offset_cost = -1;
   int first_offset_is_constant = -1;  // initialized with 'impossible' value
   int reach = 0;
   if (cost_model == NULL || cost_manager == NULL) goto Error;
-  cost_model->literal_ = (float*)(cost_model + 1);
+  cost_model->literal_ = (uint32_t*)(cost_model + 1);
   if (use_color_cache) {
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
     if (!cc_init) goto Error;
@@ -608,11 +611,12 @@ static int BackwardReferencesHashChainDistanceOnly(
   // non-processed locations from this point.
   dist_array[0] = 0;
   // Add first pixel as literal.
-  AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
-                                0.f, cost_manager->costs_, dist_array);
+  AddSingleLiteralWithCostModel(argb, &hashers, cost_model, /*idx=*/0,
+                                use_color_cache, /*prev_cost=*/0,
+                                cost_manager->costs_, dist_array);
   for (i = 1; i < pix_count; ++i) {
-    const float prev_cost = cost_manager->costs_[i - 1];
+    const int64_t prev_cost = cost_manager->costs_[i - 1];
     int offset, len;
     VP8LHashChainFindCopy(hash_chain, i, &offset, &len);

@@ -13,8 +13,6 @@
 #include "src/enc/backward_references_enc.h"
 #include <assert.h>
-#include <float.h>
-#include <math.h>
 #include "src/dsp/dsp.h"
 #include "src/dsp/lossless.h"
@@ -27,8 +25,6 @@
 #define MIN_BLOCK_SIZE 256  // minimum block size for backward references
-#define MAX_ENTROPY    (1e30f)
 // 1M window (4M bytes) minus 120 special codes for short distances.
 #define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
@@ -758,7 +754,7 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
                                   int* const best_cache_bits) {
   int i;
   const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
-  float entropy_min = MAX_ENTROPY;
+  uint64_t entropy_min = WEBP_UINT64_MAX;
   int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
   VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
   VP8LRefsCursor c = VP8LRefsCursorInit(refs);
@@ -843,7 +839,7 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
   }
   for (i = 0; i <= cache_bits_max; ++i) {
-    const float entropy = VP8LHistogramEstimateBits(histos[i]);
+    const uint64_t entropy = VP8LHistogramEstimateBits(histos[i]);
     if (i == 0 || entropy < entropy_min) {
       entropy_min = entropy;
       *best_cache_bits = i;
@@ -920,7 +916,7 @@ static int GetBackwardReferences(int width, int height,
   int i, lz77_type;
   // Index 0 is for a color cache, index 1 for no cache (if needed).
   int lz77_types_best[2] = {0, 0};
-  float bit_costs_best[2] = {FLT_MAX, FLT_MAX};
+  uint64_t bit_costs_best[2] = {WEBP_UINT64_MAX, WEBP_UINT64_MAX};
   VP8LHashChain hash_chain_box;
   VP8LBackwardRefs* const refs_tmp = &refs[do_no_cache ? 2 : 1];
   int status = 0;
@@ -932,7 +928,7 @@ static int GetBackwardReferences(int width, int height,
   for (lz77_type = 1; lz77_types_to_try;
        lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
     int res = 0;
-    float bit_cost = 0.f;
+    uint64_t bit_cost = 0u;
     if ((lz77_types_to_try & lz77_type) == 0) continue;
     switch (lz77_type) {
       case kLZ77RLE:
@@ -1006,7 +1002,7 @@ static int GetBackwardReferences(int width, int height,
       const VP8LHashChain* const hash_chain_tmp =
          (lz77_types_best[i] == kLZ77Standard) ? hash_chain : &hash_chain_box;
       const int cache_bits = (i == 1) ? 0 : *cache_bits_best;
-      float bit_cost_trace;
+      uint64_t bit_cost_trace;
       if (!VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits,
                                                 hash_chain_tmp, &refs[i],
                                                 refs_tmp)) {

@@ -55,7 +55,6 @@ int WebPConfigInitInternal(WebPConfig* config,
   config->thread_level = 0;
   config->low_memory = 0;
   config->near_lossless = 100;
-  config->use_delta_palette = 0;
   config->use_sharp_yuv = 0;
   // TODO(skal): tune.
@@ -125,9 +124,6 @@ int WebPValidateConfig(const WebPConfig* config) {
   if (config->thread_level < 0 || config->thread_level > 1) return 0;
   if (config->low_memory < 0 || config->low_memory > 1) return 0;
   if (config->exact < 0 || config->exact > 1) return 0;
-  if (config->use_delta_palette < 0 || config->use_delta_palette > 1) {
-    return 0;
-  }
   if (config->use_sharp_yuv < 0 || config->use_sharp_yuv > 1) return 0;
   return 1;

@@ -19,7 +19,7 @@
 // For each given level, the following table gives the pattern of contexts to
 // use for coding it (in [][0]) as well as the bit value to use for each
 // context (in [][1]).
-const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
+static const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
                   {0x001, 0x000}, {0x007, 0x001}, {0x00f, 0x005},
   {0x00f, 0x00d}, {0x033, 0x003}, {0x033, 0x003}, {0x033, 0x023},
   {0x033, 0x023}, {0x033, 0x023}, {0x033, 0x023}, {0x0d3, 0x013},
View File

@ -61,7 +61,6 @@ static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
} }
// Level cost calculations // Level cost calculations
extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2];
void VP8CalculateLevelCosts(VP8EncProba* const proba); void VP8CalculateLevelCosts(VP8EncProba* const proba);
static WEBP_INLINE int VP8LevelCost(const uint16_t* const table, int level) { static WEBP_INLINE int VP8LevelCost(const uint16_t* const table, int level) {
return VP8LevelFixedCosts[level] return VP8LevelFixedCosts[level]

@ -13,8 +13,7 @@
#include "src/webp/config.h" #include "src/webp/config.h"
#endif #endif
#include <float.h> #include <string.h>
#include <math.h>
#include "src/dsp/lossless.h" #include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h" #include "src/dsp/lossless_common.h"
@ -23,8 +22,6 @@
#include "src/enc/vp8i_enc.h" #include "src/enc/vp8i_enc.h"
#include "src/utils/utils.h" #include "src/utils/utils.h"
#define MAX_BIT_COST FLT_MAX
// Number of partitions for the three dominant (literal, red and blue) symbol // Number of partitions for the three dominant (literal, red and blue) symbol
// costs. // costs.
#define NUM_PARTITIONS 4 #define NUM_PARTITIONS 4
@ -33,10 +30,18 @@
// Maximum number of histograms allowed in greedy combining algorithm. // Maximum number of histograms allowed in greedy combining algorithm.
#define MAX_HISTO_GREEDY 100 #define MAX_HISTO_GREEDY 100
// Return the size of the histogram for a given cache_bits.
static int GetHistogramSize(int cache_bits) {
const int literal_size = VP8LHistogramNumCodes(cache_bits);
const size_t total_size = sizeof(VP8LHistogram) + sizeof(int) * literal_size;
assert(total_size <= (size_t)0x7fffffff);
return (int)total_size;
}
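
Note: the histogram struct and its variable-length literal_ array live in one allocation sized by GetHistogramSize(). A worked example, assuming the usual libwebp alphabet sizes (NUM_LITERAL_CODES == 256, NUM_LENGTH_CODES == 24, plus 1 << cache_bits color-cache codes when cache_bits > 0; treat these constants as assumptions):

    #include <stdio.h>

    int main(void) {
      const int cache_bits = 2;
      /* Assumed shape of VP8LHistogramNumCodes(cache_bits): */
      const int literal_size =
          256 + 24 + ((cache_bits > 0) ? (1 << cache_bits) : 0);
      printf("literal_size = %d\n", literal_size);  /* 284 counters */
      return 0;
    }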
static void HistogramClear(VP8LHistogram* const p) { static void HistogramClear(VP8LHistogram* const p) {
uint32_t* const literal = p->literal_; uint32_t* const literal = p->literal_;
const int cache_bits = p->palette_code_bits_; const int cache_bits = p->palette_code_bits_;
const int histo_size = VP8LGetHistogramSize(cache_bits); const int histo_size = GetHistogramSize(cache_bits);
memset(p, 0, histo_size); memset(p, 0, histo_size);
p->palette_code_bits_ = cache_bits; p->palette_code_bits_ = cache_bits;
p->literal_ = literal; p->literal_ = literal;
@ -54,20 +59,13 @@ static void HistogramCopy(const VP8LHistogram* const src,
uint32_t* const dst_literal = dst->literal_; uint32_t* const dst_literal = dst->literal_;
const int dst_cache_bits = dst->palette_code_bits_; const int dst_cache_bits = dst->palette_code_bits_;
const int literal_size = VP8LHistogramNumCodes(dst_cache_bits); const int literal_size = VP8LHistogramNumCodes(dst_cache_bits);
const int histo_size = VP8LGetHistogramSize(dst_cache_bits); const int histo_size = GetHistogramSize(dst_cache_bits);
assert(src->palette_code_bits_ == dst_cache_bits); assert(src->palette_code_bits_ == dst_cache_bits);
memcpy(dst, src, histo_size); memcpy(dst, src, histo_size);
dst->literal_ = dst_literal; dst->literal_ = dst_literal;
memcpy(dst->literal_, src->literal_, literal_size * sizeof(*dst->literal_)); memcpy(dst->literal_, src->literal_, literal_size * sizeof(*dst->literal_));
} }
int VP8LGetHistogramSize(int cache_bits) {
const int literal_size = VP8LHistogramNumCodes(cache_bits);
const size_t total_size = sizeof(VP8LHistogram) + sizeof(int) * literal_size;
assert(total_size <= (size_t)0x7fffffff);
return (int)total_size;
}
void VP8LFreeHistogram(VP8LHistogram* const histo) { void VP8LFreeHistogram(VP8LHistogram* const histo) {
WebPSafeFree(histo); WebPSafeFree(histo);
} }
@ -102,17 +100,17 @@ void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits,
HistogramClear(p); HistogramClear(p);
} else { } else {
p->trivial_symbol_ = 0; p->trivial_symbol_ = 0;
p->bit_cost_ = 0.; p->bit_cost_ = 0;
p->literal_cost_ = 0.; p->literal_cost_ = 0;
p->red_cost_ = 0.; p->red_cost_ = 0;
p->blue_cost_ = 0.; p->blue_cost_ = 0;
memset(p->is_used_, 0, sizeof(p->is_used_)); memset(p->is_used_, 0, sizeof(p->is_used_));
} }
} }
VP8LHistogram* VP8LAllocateHistogram(int cache_bits) { VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
VP8LHistogram* histo = NULL; VP8LHistogram* histo = NULL;
const int total_size = VP8LGetHistogramSize(cache_bits); const int total_size = GetHistogramSize(cache_bits);
uint8_t* const memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory)); uint8_t* const memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
if (memory == NULL) return NULL; if (memory == NULL) return NULL;
histo = (VP8LHistogram*)memory; histo = (VP8LHistogram*)memory;
@ -126,7 +124,7 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
static void HistogramSetResetPointers(VP8LHistogramSet* const set, static void HistogramSetResetPointers(VP8LHistogramSet* const set,
int cache_bits) { int cache_bits) {
int i; int i;
const int histo_size = VP8LGetHistogramSize(cache_bits); const int histo_size = GetHistogramSize(cache_bits);
uint8_t* memory = (uint8_t*) (set->histograms); uint8_t* memory = (uint8_t*) (set->histograms);
memory += set->max_size * sizeof(*set->histograms); memory += set->max_size * sizeof(*set->histograms);
for (i = 0; i < set->max_size; ++i) { for (i = 0; i < set->max_size; ++i) {
@ -140,7 +138,7 @@ static void HistogramSetResetPointers(VP8LHistogramSet* const set,
// Returns the total size of the VP8LHistogramSet. // Returns the total size of the VP8LHistogramSet.
static size_t HistogramSetTotalSize(int size, int cache_bits) { static size_t HistogramSetTotalSize(int size, int cache_bits) {
const int histo_size = VP8LGetHistogramSize(cache_bits); const int histo_size = GetHistogramSize(cache_bits);
return (sizeof(VP8LHistogramSet) + size * (sizeof(VP8LHistogram*) + return (sizeof(VP8LHistogramSet) + size * (sizeof(VP8LHistogram*) +
histo_size + WEBP_ALIGN_CST)); histo_size + WEBP_ALIGN_CST));
} }
@ -230,8 +228,8 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
// ----------------------------------------------------------------------------- // -----------------------------------------------------------------------------
// Entropy-related functions. // Entropy-related functions.
static WEBP_INLINE float BitsEntropyRefine(const VP8LBitEntropy* entropy) { static WEBP_INLINE uint64_t BitsEntropyRefine(const VP8LBitEntropy* entropy) {
float mix; uint64_t mix;
if (entropy->nonzeros < 5) { if (entropy->nonzeros < 5) {
if (entropy->nonzeros <= 1) { if (entropy->nonzeros <= 1) {
return 0; return 0;
@ -240,67 +238,72 @@ static WEBP_INLINE float BitsEntropyRefine(const VP8LBitEntropy* entropy) {
// Let's mix in a bit of entropy to favor good clustering when // Let's mix in a bit of entropy to favor good clustering when
// distributions of these are combined. // distributions of these are combined.
if (entropy->nonzeros == 2) { if (entropy->nonzeros == 2) {
return 0.99f * entropy->sum + 0.01f * entropy->entropy; return DivRound(99 * ((uint64_t)entropy->sum << LOG_2_PRECISION_BITS) +
entropy->entropy,
100);
} }
// No matter what the entropy says, we cannot be better than min_limit // No matter what the entropy says, we cannot be better than min_limit
// with Huffman coding. I am mixing a bit of entropy into the // with Huffman coding. I am mixing a bit of entropy into the
// min_limit since it produces much better (~0.5 %) compression results // min_limit since it produces much better (~0.5 %) compression results
// perhaps because of better entropy clustering. // perhaps because of better entropy clustering.
if (entropy->nonzeros == 3) { if (entropy->nonzeros == 3) {
mix = 0.95f; mix = 950;
} else { } else {
mix = 0.7f; // nonzeros == 4. mix = 700; // nonzeros == 4.
} }
} else { } else {
mix = 0.627f; mix = 627;
} }
{ {
float min_limit = 2.f * entropy->sum - entropy->max_val; uint64_t min_limit = (uint64_t)(2 * entropy->sum - entropy->max_val)
min_limit = mix * min_limit + (1.f - mix) * entropy->entropy; << LOG_2_PRECISION_BITS;
min_limit =
DivRound(mix * min_limit + (1000 - mix) * entropy->entropy, 1000);
return (entropy->entropy < min_limit) ? min_limit : entropy->entropy; return (entropy->entropy < min_limit) ? min_limit : entropy->entropy;
} }
} }
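
Note: the float blend 'mix * min_limit + (1 - mix) * entropy' becomes an integer blend in 1/1000 units on top of the 1/2^LOG_2_PRECISION_BITS fixed point. A minimal standalone sketch; the real DivRound() and LOG_2_PRECISION_BITS live in src/dsp/lossless_common.h, so the definitions below are assumptions for the demo:

    #include <stdint.h>
    #include <stdio.h>

    #define LOG_2_PRECISION_BITS 23  /* assumed; see src/dsp/lossless_common.h */

    static uint64_t DivRound(uint64_t a, uint64_t b) {
      return (a + b / 2) / b;  /* round-to-nearest division (positives only) */
    }

    int main(void) {
      /* Made-up entropy stats: sum of counts, max count, entropy in bits. */
      const uint64_t sum = 37, max_val = 20;
      const uint64_t entropy = (uint64_t)41 << LOG_2_PRECISION_BITS;
      const uint64_t mix = 950;  /* 0.95 expressed in 1/1000 units */
      uint64_t min_limit = (2 * sum - max_val) << LOG_2_PRECISION_BITS;
      min_limit = DivRound(mix * min_limit + (1000 - mix) * entropy, 1000);
      /* Float reference: 0.95 * 54 + 0.05 * 41 = 53.35 */
      printf("min_limit = %.2f bits\n",
             (double)min_limit / (1 << LOG_2_PRECISION_BITS));
      return 0;
    }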
float VP8LBitsEntropy(const uint32_t* const array, int n) { uint64_t VP8LBitsEntropy(const uint32_t* const array, int n) {
VP8LBitEntropy entropy; VP8LBitEntropy entropy;
VP8LBitsEntropyUnrefined(array, n, &entropy); VP8LBitsEntropyUnrefined(array, n, &entropy);
return BitsEntropyRefine(&entropy); return BitsEntropyRefine(&entropy);
} }
static float InitialHuffmanCost(void) { static uint64_t InitialHuffmanCost(void) {
// Small bias because Huffman code length is typically not stored in // Small bias because Huffman code length is typically not stored in
// full length. // full length.
static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3; static const uint64_t kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
static const float kSmallBias = 9.1f; // Subtract a bias of 9.1.
return kHuffmanCodeOfHuffmanCodeSize - kSmallBias; return (kHuffmanCodeOfHuffmanCodeSize << LOG_2_PRECISION_BITS) -
DivRound(91ll << LOG_2_PRECISION_BITS, 10);
} }
// Finalize the Huffman cost based on streak numbers and length type (<3 or >=3) // Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
static float FinalHuffmanCost(const VP8LStreaks* const stats) { static uint64_t FinalHuffmanCost(const VP8LStreaks* const stats) {
// The constants in this function are experimental and got rounded from // The constants in this function are empirical and got rounded from
// their original values in 1/8 when switched to 1/1024. // their original values in 1/8 when switched to 1/1024.
float retval = InitialHuffmanCost(); uint64_t retval = InitialHuffmanCost();
// Second coefficient: Many zeros in the histogram are covered efficiently // Second coefficient: Many zeros in the histogram are covered efficiently
// by a run-length encode. Originally 2/8. // by a run-length encode. Originally 2/8.
retval += stats->counts[0] * 1.5625f + 0.234375f * stats->streaks[0][1]; uint32_t retval_extra = stats->counts[0] * 1600 + 240 * stats->streaks[0][1];
// Second coefficient: Constant values are encoded less efficiently, but still // Second coefficient: Constant values are encoded less efficiently, but still
// RLE'ed. Originally 6/8. // RLE'ed. Originally 6/8.
retval += stats->counts[1] * 2.578125f + 0.703125f * stats->streaks[1][1]; retval_extra += stats->counts[1] * 2640 + 720 * stats->streaks[1][1];
// 0s are usually encoded more efficiently than non-0s. // 0s are usually encoded more efficiently than non-0s.
// Originally 15/8. // Originally 15/8.
retval += 1.796875f * stats->streaks[0][0]; retval_extra += 1840 * stats->streaks[0][0];
// Originally 26/8. // Originally 26/8.
retval += 3.28125f * stats->streaks[1][0]; retval_extra += 3360 * stats->streaks[1][0];
return retval; return retval + ((uint64_t)retval_extra << (LOG_2_PRECISION_BITS - 10));
} }
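
Note: the integer coefficients are the former float ones expressed in 1/1024 units, which is why the tally is shifted by (LOG_2_PRECISION_BITS - 10) at the end (1024 == 2^10). A quick sanity check; these are exact binary fractions, so '==' on doubles is safe here:

    #include <assert.h>

    int main(void) {
      assert(1600 / 1024.0 == 1.5625);    /* counts[0] */
      assert(240 / 1024.0 == 0.234375);   /* streaks[0][1] */
      assert(2640 / 1024.0 == 2.578125);  /* counts[1] */
      assert(720 / 1024.0 == 0.703125);   /* streaks[1][1] */
      assert(1840 / 1024.0 == 1.796875);  /* streaks[0][0] */
      assert(3360 / 1024.0 == 3.28125);   /* streaks[1][0] */
      return 0;
    }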
// Get the symbol entropy for the distribution 'population'. // Get the symbol entropy for the distribution 'population'.
// Set 'trivial_sym', if there's only one symbol present in the distribution. // Set 'trivial_sym', if there's only one symbol present in the distribution.
static float PopulationCost(const uint32_t* const population, int length, static uint64_t PopulationCost(const uint32_t* const population, int length,
uint32_t* const trivial_sym, uint32_t* const trivial_sym,
uint8_t* const is_used) { uint8_t* const is_used) {
VP8LBitEntropy bit_entropy; VP8LBitEntropy bit_entropy;
VP8LStreaks stats; VP8LStreaks stats;
VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats); VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats);
@ -316,10 +319,11 @@ static float PopulationCost(const uint32_t* const population, int length,
// trivial_at_end is 1 if the two histograms only have one element that is // trivial_at_end is 1 if the two histograms only have one element that is
// non-zero: both the zero-th one, or both the last one. // non-zero: both the zero-th one, or both the last one.
static WEBP_INLINE float GetCombinedEntropy(const uint32_t* const X, static WEBP_INLINE uint64_t GetCombinedEntropy(const uint32_t* const X,
const uint32_t* const Y, int length, const uint32_t* const Y,
int is_X_used, int is_Y_used, int length, int is_X_used,
int trivial_at_end) { int is_Y_used,
int trivial_at_end) {
VP8LStreaks stats; VP8LStreaks stats;
if (trivial_at_end) { if (trivial_at_end) {
// This configuration is due to palettization that transforms an indexed // This configuration is due to palettization that transforms an indexed
@ -357,7 +361,7 @@ static WEBP_INLINE float GetCombinedEntropy(const uint32_t* const X,
} }
// Estimates the Entropy + Huffman + other block overhead size cost. // Estimates the Entropy + Huffman + other block overhead size cost.
float VP8LHistogramEstimateBits(VP8LHistogram* const p) { uint64_t VP8LHistogramEstimateBits(VP8LHistogram* const p) {
return PopulationCost(p->literal_, return PopulationCost(p->literal_,
VP8LHistogramNumCodes(p->palette_code_bits_), NULL, VP8LHistogramNumCodes(p->palette_code_bits_), NULL,
&p->is_used_[0]) + &p->is_used_[0]) +
@ -366,27 +370,42 @@ float VP8LHistogramEstimateBits(VP8LHistogram* const p) {
PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3]) + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3]) +
PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL, PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL,
&p->is_used_[4]) + &p->is_used_[4]) +
(float)VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, ((uint64_t)(VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES,
NUM_LENGTH_CODES) + NUM_LENGTH_CODES) +
(float)VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES); VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES))
<< LOG_2_PRECISION_BITS);
} }
// ----------------------------------------------------------------------------- // -----------------------------------------------------------------------------
// Various histogram combine/cost-eval functions // Various histogram combine/cost-eval functions
static int GetCombinedHistogramEntropy(const VP8LHistogram* const a, // Set a + b in b, saturating at WEBP_INT64_MAX.
const VP8LHistogram* const b, static WEBP_INLINE void SaturateAdd(uint64_t a, int64_t* b) {
float cost_threshold, float* cost) { if (*b < 0 || (int64_t)a <= WEBP_INT64_MAX - *b) {
*b += (int64_t)a;
} else {
*b = WEBP_INT64_MAX;
}
}
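
Note: SaturateAdd() keeps the threshold arithmetic from overflowing when bit costs get large. A standalone rendition with a tiny demonstration, assuming WEBP_INT64_MAX is INT64_MAX (its real definition lives elsewhere in the tree):

    #include <stdint.h>
    #include <stdio.h>

    static void SaturateAdd(uint64_t a, int64_t* b) {
      if (*b < 0 || (int64_t)a <= INT64_MAX - *b) {
        *b += (int64_t)a;
      } else {
        *b = INT64_MAX;  /* clamp instead of overflowing */
      }
    }

    int main(void) {
      int64_t threshold = INT64_MAX - 5;
      SaturateAdd(100, &threshold);  /* would overflow, so it clamps */
      printf("clamped: %d\n", threshold == INT64_MAX);  /* prints 1 */
      return 0;
    }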
// Returns 1 if the cost of the combined histogram is less than the threshold.
// Otherwise returns 0 and the cost is invalid due to early bail-out.
WEBP_NODISCARD static int GetCombinedHistogramEntropy(
const VP8LHistogram* const a, const VP8LHistogram* const b,
int64_t cost_threshold_in, uint64_t* cost) {
const int palette_code_bits = a->palette_code_bits_; const int palette_code_bits = a->palette_code_bits_;
int trivial_at_end = 0; int trivial_at_end = 0;
const uint64_t cost_threshold = (uint64_t)cost_threshold_in;
assert(a->palette_code_bits_ == b->palette_code_bits_); assert(a->palette_code_bits_ == b->palette_code_bits_);
*cost += GetCombinedEntropy(a->literal_, b->literal_, if (cost_threshold_in <= 0) return 0;
VP8LHistogramNumCodes(palette_code_bits), *cost = GetCombinedEntropy(a->literal_, b->literal_,
a->is_used_[0], b->is_used_[0], 0); VP8LHistogramNumCodes(palette_code_bits),
*cost += (float)VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES, a->is_used_[0], b->is_used_[0], 0);
b->literal_ + NUM_LITERAL_CODES, *cost += (uint64_t)VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
NUM_LENGTH_CODES); b->literal_ + NUM_LITERAL_CODES,
if (*cost > cost_threshold) return 0; NUM_LENGTH_CODES)
<< LOG_2_PRECISION_BITS;
if (*cost >= cost_threshold) return 0;
if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM && if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
a->trivial_symbol_ == b->trivial_symbol_) { a->trivial_symbol_ == b->trivial_symbol_) {
@ -401,27 +420,24 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
} }
} }
*cost += *cost += GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES,
GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, a->is_used_[1], a->is_used_[1], b->is_used_[1], trivial_at_end);
b->is_used_[1], trivial_at_end); if (*cost >= cost_threshold) return 0;
if (*cost > cost_threshold) return 0;
*cost += *cost += GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES,
GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, a->is_used_[2], a->is_used_[2], b->is_used_[2], trivial_at_end);
b->is_used_[2], trivial_at_end); if (*cost >= cost_threshold) return 0;
if (*cost > cost_threshold) return 0;
*cost += *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES,
GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES, a->is_used_[3], b->is_used_[3], trivial_at_end);
a->is_used_[3], b->is_used_[3], trivial_at_end); if (*cost >= cost_threshold) return 0;
if (*cost > cost_threshold) return 0;
*cost += *cost += GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES, a->is_used_[4], b->is_used_[4], 0);
a->is_used_[4], b->is_used_[4], 0); *cost += (uint64_t)VP8LExtraCostCombined(a->distance_, b->distance_,
*cost += (float)VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES)
NUM_DISTANCE_CODES); << LOG_2_PRECISION_BITS;
if (*cost > cost_threshold) return 0; if (*cost >= cost_threshold) return 0;
return 1; return 1;
} }
@ -441,33 +457,39 @@ static WEBP_INLINE void HistogramAdd(const VP8LHistogram* const a,
// Since the previous score passed is 'cost_threshold', we only need to compare // Since the previous score passed is 'cost_threshold', we only need to compare
// the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail out // the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail out
// early. // early.
static float HistogramAddEval(const VP8LHistogram* const a, // Returns 1 if the cost is less than the threshold.
const VP8LHistogram* const b, // Otherwise returns 0 and the cost is invalid due to early bail-out.
VP8LHistogram* const out, float cost_threshold) { WEBP_NODISCARD static int HistogramAddEval(const VP8LHistogram* const a,
float cost = 0; const VP8LHistogram* const b,
const float sum_cost = a->bit_cost_ + b->bit_cost_; VP8LHistogram* const out,
cost_threshold += sum_cost; int64_t cost_threshold) {
uint64_t cost;
const uint64_t sum_cost = a->bit_cost_ + b->bit_cost_;
SaturateAdd(sum_cost, &cost_threshold);
if (!GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) return 0;
if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) { HistogramAdd(a, b, out);
HistogramAdd(a, b, out); out->bit_cost_ = cost;
out->bit_cost_ = cost; out->palette_code_bits_ = a->palette_code_bits_;
out->palette_code_bits_ = a->palette_code_bits_; return 1;
}
return cost - sum_cost;
} }
// Same as HistogramAddEval(), except that the resulting histogram // Same as HistogramAddEval(), except that the resulting histogram
// is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit // is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
// the term C(b) which is constant over all the evaluations. // the term C(b) which is constant over all the evaluations.
static float HistogramAddThresh(const VP8LHistogram* const a, // Returns 1 if the cost is less than the threshold.
const VP8LHistogram* const b, // Otherwise returns 0 and the cost is invalid due to early bail-out.
float cost_threshold) { WEBP_NODISCARD static int HistogramAddThresh(const VP8LHistogram* const a,
float cost; const VP8LHistogram* const b,
int64_t cost_threshold,
int64_t* cost_out) {
uint64_t cost;
assert(a != NULL && b != NULL); assert(a != NULL && b != NULL);
cost = -a->bit_cost_; SaturateAdd(a->bit_cost_, &cost_threshold);
GetCombinedHistogramEntropy(a, b, cost_threshold, &cost); if (!GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) return 0;
return cost;
*cost_out = (int64_t)cost - (int64_t)a->bit_cost_;
return 1;
} }
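
Note: a numeric illustration (made-up costs) of the threshold shifting used by HistogramAddEval() and HistogramAddThresh(): testing C(a+b) against 'threshold + C(a) + C(b)' is equivalent to testing the cost difference against 'threshold', but lets GetCombinedHistogramEntropy() abort partway through the sum:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const int64_t cost_a = 1000, cost_b = 800;  /* C(a), C(b), made up */
      const int64_t threshold = -50;              /* required improvement */
      const int64_t cost_combo = 1700;            /* C(a+b), made up */
      const int64_t shifted = threshold + cost_a + cost_b;  /* 1750 */
      printf("accept: %d\n", cost_combo < shifted);  /* 1 (diff -100 < -50) */
      printf("diff: %lld\n", (long long)(cost_combo - cost_a - cost_b));
      return 0;
    }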
// ----------------------------------------------------------------------------- // -----------------------------------------------------------------------------
@ -475,21 +497,21 @@ static float HistogramAddThresh(const VP8LHistogram* const a,
// The structure to keep track of cost range for the three dominant entropy // The structure to keep track of cost range for the three dominant entropy
// symbols. // symbols.
typedef struct { typedef struct {
float literal_max_; uint64_t literal_max_;
float literal_min_; uint64_t literal_min_;
float red_max_; uint64_t red_max_;
float red_min_; uint64_t red_min_;
float blue_max_; uint64_t blue_max_;
float blue_min_; uint64_t blue_min_;
} DominantCostRange; } DominantCostRange;
static void DominantCostRangeInit(DominantCostRange* const c) { static void DominantCostRangeInit(DominantCostRange* const c) {
c->literal_max_ = 0.; c->literal_max_ = 0;
c->literal_min_ = MAX_BIT_COST; c->literal_min_ = WEBP_UINT64_MAX;
c->red_max_ = 0.; c->red_max_ = 0;
c->red_min_ = MAX_BIT_COST; c->red_min_ = WEBP_UINT64_MAX;
c->blue_max_ = 0.; c->blue_max_ = 0;
c->blue_min_ = MAX_BIT_COST; c->blue_min_ = WEBP_UINT64_MAX;
} }
static void UpdateDominantCostRange( static void UpdateDominantCostRange(
@ -504,15 +526,18 @@ static void UpdateDominantCostRange(
static void UpdateHistogramCost(VP8LHistogram* const h) { static void UpdateHistogramCost(VP8LHistogram* const h) {
uint32_t alpha_sym, red_sym, blue_sym; uint32_t alpha_sym, red_sym, blue_sym;
const float alpha_cost = const uint64_t alpha_cost =
PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, &h->is_used_[3]); PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, &h->is_used_[3]);
const float distance_cost = const uint64_t distance_cost =
PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) + PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
(float)VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES); ((uint64_t)VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES)
<< LOG_2_PRECISION_BITS);
const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_); const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
h->literal_cost_ = h->literal_cost_ =
PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) + PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) +
(float)VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES); ((uint64_t)VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES,
NUM_LENGTH_CODES)
<< LOG_2_PRECISION_BITS);
h->red_cost_ = h->red_cost_ =
PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]); PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]);
h->blue_cost_ = h->blue_cost_ =
@ -527,10 +552,10 @@ static void UpdateHistogramCost(VP8LHistogram* const h) {
} }
} }
static int GetBinIdForEntropy(float min, float max, float val) { static int GetBinIdForEntropy(uint64_t min, uint64_t max, uint64_t val) {
const float range = max - min; const uint64_t range = max - min;
if (range > 0.) { if (range > 0) {
const float delta = val - min; const uint64_t delta = val - min;
return (int)((NUM_PARTITIONS - 1e-6) * delta / range); return (int)((NUM_PARTITIONS - 1e-6) * delta / range);
} else { } else {
return 0; return 0;
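
Note: a standalone copy of the binning above with sample values; the '- 1e-6' keeps val == max inside the last bin (NUM_PARTITIONS - 1) instead of spilling into a nonexistent fifth bin:

    #include <stdint.h>
    #include <stdio.h>

    #define NUM_PARTITIONS 4

    static int GetBinIdForEntropy(uint64_t min, uint64_t max, uint64_t val) {
      const uint64_t range = max - min;
      if (range > 0) {
        const uint64_t delta = val - min;
        return (int)((NUM_PARTITIONS - 1e-6) * delta / range);
      }
      return 0;
    }

    int main(void) {
      printf("%d %d %d\n",
             GetBinIdForEntropy(10, 50, 10),   /* 0 */
             GetBinIdForEntropy(10, 50, 30),   /* 1 */
             GetBinIdForEntropy(10, 50, 50));  /* 3 */
      return 0;
    }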
@ -576,11 +601,11 @@ static void HistogramBuild(
} }
// Copies the histograms and computes its bit_cost. // Copies the histograms and computes its bit_cost.
static const uint16_t kInvalidHistogramSymbol = (uint16_t)(-1); static const uint32_t kInvalidHistogramSymbol = (uint32_t)(-1);
static void HistogramCopyAndAnalyze(VP8LHistogramSet* const orig_histo, static void HistogramCopyAndAnalyze(VP8LHistogramSet* const orig_histo,
VP8LHistogramSet* const image_histo, VP8LHistogramSet* const image_histo,
int* const num_used, int* const num_used,
uint16_t* const histogram_symbols) { uint32_t* const histogram_symbols) {
int i, cluster_id; int i, cluster_id;
int num_used_orig = *num_used; int num_used_orig = *num_used;
VP8LHistogram** const orig_histograms = orig_histo->histograms; VP8LHistogram** const orig_histograms = orig_histo->histograms;
@ -639,11 +664,12 @@ static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
// Merges some histograms with same bin_id together if it's advantageous. // Merges some histograms with same bin_id together if it's advantageous.
// Sets the remaining histograms to NULL. // Sets the remaining histograms to NULL.
// 'combine_cost_factor' has to be divided by 100.
static void HistogramCombineEntropyBin( static void HistogramCombineEntropyBin(
VP8LHistogramSet* const image_histo, int* num_used, VP8LHistogramSet* const image_histo, int* num_used,
const uint16_t* const clusters, uint16_t* const cluster_mappings, const uint32_t* const clusters, uint16_t* const cluster_mappings,
VP8LHistogram* cur_combo, const uint16_t* const bin_map, int num_bins, VP8LHistogram* cur_combo, const uint16_t* const bin_map, int num_bins,
float combine_cost_factor, int low_effort) { int32_t combine_cost_factor, int low_effort) {
VP8LHistogram** const histograms = image_histo->histograms; VP8LHistogram** const histograms = image_histo->histograms;
int idx; int idx;
struct { struct {
@ -673,11 +699,11 @@ static void HistogramCombineEntropyBin(
cluster_mappings[clusters[idx]] = clusters[first]; cluster_mappings[clusters[idx]] = clusters[first];
} else { } else {
// try to merge #idx into #first (both share the same bin_id) // try to merge #idx into #first (both share the same bin_id)
const float bit_cost = histograms[idx]->bit_cost_; const uint64_t bit_cost = histograms[idx]->bit_cost_;
const float bit_cost_thresh = -bit_cost * combine_cost_factor; const int64_t bit_cost_thresh =
const float curr_cost_diff = HistogramAddEval( -DivRound((int64_t)bit_cost * combine_cost_factor, 100);
histograms[first], histograms[idx], cur_combo, bit_cost_thresh); if (HistogramAddEval(histograms[first], histograms[idx], cur_combo,
if (curr_cost_diff < bit_cost_thresh) { bit_cost_thresh)) {
// Try to merge two histograms only if the combo is a trivial one or // Try to merge two histograms only if the combo is a trivial one or
// the two candidate histograms are already non-trivial. // the two candidate histograms are already non-trivial.
// For some images, 'try_combine' turns out to be false for a lot of // For some images, 'try_combine' turns out to be false for a lot of
@ -724,8 +750,8 @@ static uint32_t MyRand(uint32_t* const seed) {
typedef struct { typedef struct {
int idx1; int idx1;
int idx2; int idx2;
float cost_diff; int64_t cost_diff;
float cost_combo; uint64_t cost_combo;
} HistogramPair; } HistogramPair;
typedef struct { typedef struct {
@ -765,7 +791,7 @@ static void HistoQueuePopPair(HistoQueue* const histo_queue,
// Check whether a pair in the queue should be updated as head or not. // Check whether a pair in the queue should be updated as head or not.
static void HistoQueueUpdateHead(HistoQueue* const histo_queue, static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
HistogramPair* const pair) { HistogramPair* const pair) {
assert(pair->cost_diff < 0.); assert(pair->cost_diff < 0);
assert(pair >= histo_queue->queue && assert(pair >= histo_queue->queue &&
pair < (histo_queue->queue + histo_queue->size)); pair < (histo_queue->queue + histo_queue->size));
assert(histo_queue->size > 0); assert(histo_queue->size > 0);
@ -778,29 +804,35 @@ static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
} }
// Update the cost diff and combo of a pair of histograms. This needs to be // Update the cost diff and combo of a pair of histograms. This needs to be
// called when the the histograms have been merged with a third one. // called when the histograms have been merged with a third one.
static void HistoQueueUpdatePair(const VP8LHistogram* const h1, // Returns 1 if the cost diff is less than the threshold.
const VP8LHistogram* const h2, float threshold, // Otherwise returns 0 and the cost is invalid due to early bail-out.
HistogramPair* const pair) { WEBP_NODISCARD static int HistoQueueUpdatePair(const VP8LHistogram* const h1,
const float sum_cost = h1->bit_cost_ + h2->bit_cost_; const VP8LHistogram* const h2,
pair->cost_combo = 0.; int64_t cost_threshold,
GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair->cost_combo); HistogramPair* const pair) {
pair->cost_diff = pair->cost_combo - sum_cost; const int64_t sum_cost = h1->bit_cost_ + h2->bit_cost_;
SaturateAdd(sum_cost, &cost_threshold);
if (!GetCombinedHistogramEntropy(h1, h2, cost_threshold, &pair->cost_combo)) {
return 0;
}
pair->cost_diff = (int64_t)pair->cost_combo - sum_cost;
return 1;
} }
// Create a pair from indices "idx1" and "idx2" provided its cost // Create a pair from indices "idx1" and "idx2" provided its cost
// is inferior to "threshold", a negative entropy. // is inferior to "threshold", a negative entropy.
// It returns the cost of the pair, or 0. if it is superior to the threshold. // It returns the cost of the pair, or 0 if it is superior to the threshold.
static float HistoQueuePush(HistoQueue* const histo_queue, static int64_t HistoQueuePush(HistoQueue* const histo_queue,
VP8LHistogram** const histograms, int idx1, VP8LHistogram** const histograms, int idx1,
int idx2, float threshold) { int idx2, int64_t threshold) {
const VP8LHistogram* h1; const VP8LHistogram* h1;
const VP8LHistogram* h2; const VP8LHistogram* h2;
HistogramPair pair; HistogramPair pair;
// Stop here if the queue is full. // Stop here if the queue is full.
if (histo_queue->size == histo_queue->max_size) return 0.; if (histo_queue->size == histo_queue->max_size) return 0;
assert(threshold <= 0.); assert(threshold <= 0);
if (idx1 > idx2) { if (idx1 > idx2) {
const int tmp = idx2; const int tmp = idx2;
idx2 = idx1; idx2 = idx1;
@ -811,10 +843,8 @@ static float HistoQueuePush(HistoQueue* const histo_queue,
h1 = histograms[idx1]; h1 = histograms[idx1];
h2 = histograms[idx2]; h2 = histograms[idx2];
HistoQueueUpdatePair(h1, h2, threshold, &pair);
// Do not even consider the pair if it does not improve the entropy. // Do not even consider the pair if it does not improve the entropy.
if (pair.cost_diff >= threshold) return 0.; if (!HistoQueueUpdatePair(h1, h2, threshold, &pair)) return 0;
histo_queue->queue[histo_queue->size++] = pair; histo_queue->queue[histo_queue->size++] = pair;
HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]); HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]);
@ -851,7 +881,7 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo,
for (j = i + 1; j < image_histo_size; ++j) { for (j = i + 1; j < image_histo_size; ++j) {
// Initialize queue. // Initialize queue.
if (image_histo->histograms[j] == NULL) continue; if (image_histo->histograms[j] == NULL) continue;
HistoQueuePush(&histo_queue, histograms, i, j, 0.); HistoQueuePush(&histo_queue, histograms, i, j, 0);
} }
} }
@ -879,7 +909,7 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo,
// Push new pairs formed with combined histogram to the queue. // Push new pairs formed with combined histogram to the queue.
for (i = 0; i < image_histo->size; ++i) { for (i = 0; i < image_histo->size; ++i) {
if (i == idx1 || image_histo->histograms[i] == NULL) continue; if (i == idx1 || image_histo->histograms[i] == NULL) continue;
HistoQueuePush(&histo_queue, image_histo->histograms, idx1, i, 0.); HistoQueuePush(&histo_queue, image_histo->histograms, idx1, i, 0);
} }
} }
@ -937,8 +967,8 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
++tries_with_no_success < num_tries_no_success; ++tries_with_no_success < num_tries_no_success;
++iter) { ++iter) {
int* mapping_index; int* mapping_index;
float best_cost = int64_t best_cost =
(histo_queue.size == 0) ? 0.f : histo_queue.queue[0].cost_diff; (histo_queue.size == 0) ? 0 : histo_queue.queue[0].cost_diff;
int best_idx1 = -1, best_idx2 = 1; int best_idx1 = -1, best_idx2 = 1;
const uint32_t rand_range = (*num_used - 1) * (*num_used); const uint32_t rand_range = (*num_used - 1) * (*num_used);
// (*num_used) / 2 was chosen empirically. Less means faster but worse // (*num_used) / 2 was chosen empirically. Less means faster but worse
@ -947,7 +977,7 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
// Pick random samples. // Pick random samples.
for (j = 0; *num_used >= 2 && j < num_tries; ++j) { for (j = 0; *num_used >= 2 && j < num_tries; ++j) {
float curr_cost; int64_t curr_cost;
// Choose two different histograms at random and try to combine them. // Choose two different histograms at random and try to combine them.
const uint32_t tmp = MyRand(&seed) % rand_range; const uint32_t tmp = MyRand(&seed) % rand_range;
uint32_t idx1 = tmp / (*num_used - 1); uint32_t idx1 = tmp / (*num_used - 1);
@ -1012,8 +1042,8 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
} }
if (do_eval) { if (do_eval) {
// Re-evaluate the cost of an updated pair. // Re-evaluate the cost of an updated pair.
HistoQueueUpdatePair(histograms[p->idx1], histograms[p->idx2], 0., p); if (!HistoQueueUpdatePair(histograms[p->idx1], histograms[p->idx2], 0,
if (p->cost_diff >= 0.) { p)) {
HistoQueuePopPair(&histo_queue, p); HistoQueuePopPair(&histo_queue, p);
continue; continue;
} }
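
Note: one random draw in [0, rand_range) encodes a pair of distinct histogram indices. The idx2 decode sits outside the hunks shown here; the sketch below assumes the usual remainder-plus-skip trick and enumerates every draw to show each ordered pair appears exactly once:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const uint32_t num_used = 5;
      const uint32_t rand_range = (num_used - 1) * num_used;  /* 20 pairs */
      uint32_t tmp;
      for (tmp = 0; tmp < rand_range; ++tmp) {
        const uint32_t idx1 = tmp / (num_used - 1);
        uint32_t idx2 = tmp % (num_used - 1);
        if (idx2 >= idx1) ++idx2;  /* skip idx1 so that idx2 != idx1 */
        printf("(%u,%u) ", idx1, idx2);
      }
      printf("\n");
      return 0;
    }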
@ -1040,7 +1070,7 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
// Note: we assume that out[]->bit_cost_ is already up-to-date. // Note: we assume that out[]->bit_cost_ is already up-to-date.
static void HistogramRemap(const VP8LHistogramSet* const in, static void HistogramRemap(const VP8LHistogramSet* const in,
VP8LHistogramSet* const out, VP8LHistogramSet* const out,
uint16_t* const symbols) { uint32_t* const symbols) {
int i; int i;
VP8LHistogram** const in_histo = in->histograms; VP8LHistogram** const in_histo = in->histograms;
VP8LHistogram** const out_histo = out->histograms; VP8LHistogram** const out_histo = out->histograms;
@ -1049,7 +1079,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
if (out_size > 1) { if (out_size > 1) {
for (i = 0; i < in_size; ++i) { for (i = 0; i < in_size; ++i) {
int best_out = 0; int best_out = 0;
float best_bits = MAX_BIT_COST; int64_t best_bits = WEBP_INT64_MAX;
int k; int k;
if (in_histo[i] == NULL) { if (in_histo[i] == NULL) {
// Arbitrarily set to the previous value if unused to help future LZ77. // Arbitrarily set to the previous value if unused to help future LZ77.
@ -1057,9 +1087,9 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
continue; continue;
} }
for (k = 0; k < out_size; ++k) { for (k = 0; k < out_size; ++k) {
float cur_bits; int64_t cur_bits;
cur_bits = HistogramAddThresh(out_histo[k], in_histo[i], best_bits); if (HistogramAddThresh(out_histo[k], in_histo[i], best_bits,
if (k == 0 || cur_bits < best_bits) { &cur_bits)) {
best_bits = cur_bits; best_bits = cur_bits;
best_out = k; best_out = k;
} }
@ -1085,13 +1115,13 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
} }
} }
static float GetCombineCostFactor(int histo_size, int quality) { static int32_t GetCombineCostFactor(int histo_size, int quality) {
float combine_cost_factor = 0.16f; int32_t combine_cost_factor = 16;
if (quality < 90) { if (quality < 90) {
if (histo_size > 256) combine_cost_factor /= 2.f; if (histo_size > 256) combine_cost_factor /= 2;
if (histo_size > 512) combine_cost_factor /= 2.f; if (histo_size > 512) combine_cost_factor /= 2;
if (histo_size > 1024) combine_cost_factor /= 2.f; if (histo_size > 1024) combine_cost_factor /= 2;
if (quality <= 50) combine_cost_factor /= 2.f; if (quality <= 50) combine_cost_factor /= 2;
} }
return combine_cost_factor; return combine_cost_factor;
} }
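
Note: the returned factor is a percentage (16 == 0.16) consumed above as '-DivRound((int64_t)bit_cost * combine_cost_factor, 100)'. A standalone copy with one sample point:

    #include <stdint.h>
    #include <stdio.h>

    static int32_t GetCombineCostFactor(int histo_size, int quality) {
      int32_t combine_cost_factor = 16;
      if (quality < 90) {
        if (histo_size > 256) combine_cost_factor /= 2;
        if (histo_size > 512) combine_cost_factor /= 2;
        if (histo_size > 1024) combine_cost_factor /= 2;
        if (quality <= 50) combine_cost_factor /= 2;
      }
      return combine_cost_factor;
    }

    int main(void) {
      /* 600 > 256 and > 512, quality <= 50: 16/2/2/2 == 2, i.e. 0.02 */
      printf("%d\n", GetCombineCostFactor(600, 40));
      return 0;
    }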
@ -1101,10 +1131,10 @@ static float GetCombineCostFactor(int histo_size, int quality) {
// assign the smallest possible clusters values. // assign the smallest possible clusters values.
static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set, static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set,
uint16_t* const cluster_mappings, uint16_t* const cluster_mappings,
int num_clusters, uint32_t num_clusters,
uint16_t* const cluster_mappings_tmp, uint16_t* const cluster_mappings_tmp,
uint16_t* const symbols) { uint32_t* const symbols) {
int i, cluster_max; uint32_t i, cluster_max;
int do_continue = 1; int do_continue = 1;
// First, assign the lowest cluster to each pixel. // First, assign the lowest cluster to each pixel.
while (do_continue) { while (do_continue) {
@ -1128,7 +1158,7 @@ static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set,
set->max_size * sizeof(*cluster_mappings_tmp)); set->max_size * sizeof(*cluster_mappings_tmp));
assert(cluster_mappings[0] == 0); assert(cluster_mappings[0] == 0);
// Re-map the ids. // Re-map the ids.
for (i = 0; i < set->max_size; ++i) { for (i = 0; i < (uint32_t)set->max_size; ++i) {
int cluster; int cluster;
if (symbols[i] == kInvalidHistogramSymbol) continue; if (symbols[i] == kInvalidHistogramSymbol) continue;
cluster = cluster_mappings[symbols[i]]; cluster = cluster_mappings[symbols[i]];
@ -1142,7 +1172,7 @@ static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set,
// Make sure all cluster values are used. // Make sure all cluster values are used.
cluster_max = 0; cluster_max = 0;
for (i = 0; i < set->max_size; ++i) { for (i = 0; i < (uint32_t)set->max_size; ++i) {
if (symbols[i] == kInvalidHistogramSymbol) continue; if (symbols[i] == kInvalidHistogramSymbol) continue;
if (symbols[i] <= cluster_max) continue; if (symbols[i] <= cluster_max) continue;
++cluster_max; ++cluster_max;
@ -1165,7 +1195,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
int low_effort, int histogram_bits, int cache_bits, int low_effort, int histogram_bits, int cache_bits,
VP8LHistogramSet* const image_histo, VP8LHistogramSet* const image_histo,
VP8LHistogram* const tmp_histo, VP8LHistogram* const tmp_histo,
uint16_t* const histogram_symbols, uint32_t* const histogram_symbols,
const WebPPicture* const pic, int percent_range, const WebPPicture* const pic, int percent_range,
int* const percent) { int* const percent) {
const int histo_xsize = const int histo_xsize =
@ -1181,7 +1211,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE; const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
int entropy_combine; int entropy_combine;
uint16_t* const map_tmp = uint16_t* const map_tmp =
WebPSafeMalloc(2 * image_histo_raw_size, sizeof(*map_tmp)); (uint16_t*)WebPSafeMalloc(2 * image_histo_raw_size, sizeof(*map_tmp));
uint16_t* const cluster_mappings = map_tmp + image_histo_raw_size; uint16_t* const cluster_mappings = map_tmp + image_histo_raw_size;
int num_used = image_histo_raw_size; int num_used = image_histo_raw_size;
if (orig_histo == NULL || map_tmp == NULL) { if (orig_histo == NULL || map_tmp == NULL) {
@ -1201,7 +1231,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
if (entropy_combine) { if (entropy_combine) {
uint16_t* const bin_map = map_tmp; uint16_t* const bin_map = map_tmp;
const float combine_cost_factor = const int32_t combine_cost_factor =
GetCombineCostFactor(image_histo_raw_size, quality); GetCombineCostFactor(image_histo_raw_size, quality);
const uint32_t num_clusters = num_used; const uint32_t num_clusters = num_used;
@ -1217,9 +1247,10 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
// Don't combine the histograms using stochastic and greedy heuristics for // Don't combine the histograms using stochastic and greedy heuristics for
// low-effort compression mode. // low-effort compression mode.
if (!low_effort || !entropy_combine) { if (!low_effort || !entropy_combine) {
const float x = quality / 100.f;
// cubic ramp between 1 and MAX_HISTO_GREEDY: // cubic ramp between 1 and MAX_HISTO_GREEDY:
const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1)); const int threshold_size =
(int)(1 + DivRound(quality * quality * quality * (MAX_HISTO_GREEDY - 1),
100 * 100 * 100));
int do_greedy; int do_greedy;
if (!HistogramCombineStochastic(image_histo, &num_used, threshold_size, if (!HistogramCombineStochastic(image_histo, &num_used, threshold_size,
&do_greedy)) { &do_greedy)) {
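
Note: for quality 50 the integer cubic ramp and the old float one agree on a threshold of 13. A minimal check, with DivRound assumed to round to nearest as in src/dsp/lossless_common.h:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_HISTO_GREEDY 100

    static int64_t DivRound(int64_t a, int64_t b) { return (a + b / 2) / b; }

    int main(void) {
      const int quality = 50;
      const int int_ramp =
          (int)(1 + DivRound((int64_t)quality * quality * quality *
                                 (MAX_HISTO_GREEDY - 1),
                             100 * 100 * 100));
      const float x = quality / 100.f;
      const int float_ramp = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1));
      printf("%d %d\n", int_ramp, float_ramp);  /* 13 13 */
      return 0;
    }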

View File

@ -40,10 +40,10 @@ typedef struct {
int palette_code_bits_; int palette_code_bits_;
uint32_t trivial_symbol_; // True, if histograms for Red, Blue & Alpha uint32_t trivial_symbol_; // True, if histograms for Red, Blue & Alpha
// literal symbols are single valued. // literal symbols are single valued.
float bit_cost_; // cached value of bit cost. uint64_t bit_cost_; // cached value of bit cost.
float literal_cost_; // Cached values of dominant entropy costs: uint64_t literal_cost_; // Cached values of dominant entropy costs:
float red_cost_; // literal, red & blue. uint64_t red_cost_; // literal, red & blue.
float blue_cost_; uint64_t blue_cost_;
uint8_t is_used_[5]; // 5 for literal, red, blue, alpha, distance uint8_t is_used_[5]; // 5 for literal, red, blue, alpha, distance
} VP8LHistogram; } VP8LHistogram;
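
Note: with this struct change every cached cost is an unsigned fixed-point value ('bits << LOG_2_PRECISION_BITS'), so cost comparisons become exact integer comparisons across platforms. Converting back to fractional bits for display (precision value assumed):

    #include <stdint.h>
    #include <stdio.h>

    #define LOG_2_PRECISION_BITS 23  /* assumed; see src/dsp/lossless_common.h */

    int main(void) {
      const uint64_t bit_cost = 447539200;  /* some cached bit_cost_ value */
      printf("%.2f bits\n", (double)bit_cost / (1 << LOG_2_PRECISION_BITS));
      return 0;
    }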
@ -64,9 +64,6 @@ void VP8LHistogramCreate(VP8LHistogram* const p,
const VP8LBackwardRefs* const refs, const VP8LBackwardRefs* const refs,
int palette_code_bits); int palette_code_bits);
// Return the size of the histogram for a given cache_bits.
int VP8LGetHistogramSize(int cache_bits);
// Set the palette_code_bits and reset the stats. // Set the palette_code_bits and reset the stats.
// If init_arrays is true, the arrays are also filled with 0's. // If init_arrays is true, the arrays are also filled with 0's.
void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits, void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits,
@ -112,16 +109,16 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
int low_effort, int histogram_bits, int cache_bits, int low_effort, int histogram_bits, int cache_bits,
VP8LHistogramSet* const image_histo, VP8LHistogramSet* const image_histo,
VP8LHistogram* const tmp_histo, VP8LHistogram* const tmp_histo,
uint16_t* const histogram_symbols, uint32_t* const histogram_symbols,
const WebPPicture* const pic, int percent_range, const WebPPicture* const pic, int percent_range,
int* const percent); int* const percent);
// Returns the entropy for the symbols in the input array. // Returns the entropy for the symbols in the input array.
float VP8LBitsEntropy(const uint32_t* const array, int n); uint64_t VP8LBitsEntropy(const uint32_t* const array, int n);
// Estimate how many bits the combined entropy of literals and distance // Estimate how many bits the combined entropy of literals and distance
// approximately maps to. // approximately maps to.
float VP8LHistogramEstimateBits(VP8LHistogram* const p); uint64_t VP8LHistogramEstimateBits(VP8LHistogram* const p);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -13,6 +13,7 @@
#include <string.h> #include <string.h>
#include "src/dsp/cpu.h"
#include "src/enc/vp8i_enc.h" #include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
@ -54,7 +55,8 @@ void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
InitLeft(it); InitLeft(it);
} }
void VP8IteratorReset(VP8EncIterator* const it) { // restart a scan
static void VP8IteratorReset(VP8EncIterator* const it) {
VP8Encoder* const enc = it->enc_; VP8Encoder* const enc = it->enc_;
VP8IteratorSetRow(it, 0); VP8IteratorSetRow(it, 0);
VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_); // default VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_); // default
@ -424,6 +426,15 @@ void VP8IteratorStartI4(VP8EncIterator* const it) {
it->i4_boundary_[17 + i] = it->i4_boundary_[17 + 15]; it->i4_boundary_[17 + i] = it->i4_boundary_[17 + 15];
} }
} }
#if WEBP_AARCH64 && BPS == 32 && defined(WEBP_MSAN)
// Intra4Preds_NEON() reads 3 uninitialized bytes from i4_boundary_ when top
// is positioned at offset 29 (VP8TopLeftI4[3]). The values are not used
// meaningfully, but due to limitations in MemorySanitizer related to
// modeling of tbl instructions, a warning will be issued. This can be
// removed if MSan is updated to support the instructions. See
// https://issues.webmproject.org/372109644.
memset(it->i4_boundary_ + sizeof(it->i4_boundary_) - 3, 0xaa, 3);
#endif
VP8IteratorNzToBytes(it); // import the non-zero context VP8IteratorNzToBytes(it); // import the non-zero context
} }

View File

@ -14,53 +14,75 @@
// Urvang Joshi (urvang@google.com) // Urvang Joshi (urvang@google.com)
// Vincent Rabaud (vrabaud@google.com) // Vincent Rabaud (vrabaud@google.com)
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "src/dsp/lossless.h" #include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h" #include "src/dsp/lossless_common.h"
#include "src/enc/vp8i_enc.h" #include "src/enc/vp8i_enc.h"
#include "src/enc/vp8li_enc.h" #include "src/enc/vp8li_enc.h"
#include "src/utils/utils.h"
#include "src/webp/encode.h"
#include "src/webp/format_constants.h"
#include "src/webp/types.h"
#define MAX_DIFF_COST (1e30f) #define HISTO_SIZE (4 * 256)
static const int64_t kSpatialPredictorBias = 15ll << LOG_2_PRECISION_BITS;
static const float kSpatialPredictorBias = 15.f;
static const int kPredLowEffort = 11; static const int kPredLowEffort = 11;
static const uint32_t kMaskAlpha = 0xff000000; static const uint32_t kMaskAlpha = 0xff000000;
static const int kNumPredModes = 14;
// Mostly used to reduce code size + readability // Mostly used to reduce code size + readability
static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; } static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Methods to calculate Entropy (Shannon). // Methods to calculate Entropy (Shannon).
static float PredictionCostSpatial(const int counts[256], int weight_0, // Compute a bias for prediction entropy using a global heuristic to favor
float exp_val) { // values closer to 0. Hence the final negative sign.
// 'exp_val' has a scaling factor of 1/100.
static int64_t PredictionCostBias(const uint32_t counts[256], uint64_t weight_0,
uint64_t exp_val) {
const int significant_symbols = 256 >> 4; const int significant_symbols = 256 >> 4;
const float exp_decay_factor = 0.6f; const uint64_t exp_decay_factor = 6; // has a scaling factor of 1/10
float bits = (float)weight_0 * counts[0]; uint64_t bits = (weight_0 * counts[0]) << LOG_2_PRECISION_BITS;
int i; int i;
exp_val <<= LOG_2_PRECISION_BITS;
for (i = 1; i < significant_symbols; ++i) { for (i = 1; i < significant_symbols; ++i) {
bits += exp_val * (counts[i] + counts[256 - i]); bits += DivRound(exp_val * (counts[i] + counts[256 - i]), 100);
exp_val *= exp_decay_factor; exp_val = DivRound(exp_decay_factor * exp_val, 10);
} }
return (float)(-0.1 * bits); return -DivRound((int64_t)bits, 10);
} }
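
Note: a float rendition of the decay above with made-up counts, mirroring the old PredictionCostSpatial(): weight 1 on counts[0], a 0.94 starting weight decaying by 0.6 per step over the 16 symbols nearest zero (taken from both ends of the histogram), and a final -0.1 factor:

    #include <stdio.h>

    int main(void) {
      int counts[256] = {0};
      double exp_val = 0.94, bits;
      int i;
      counts[0] = 100;   /* residuals of 0: favored */
      counts[1] = 40;    /* +1 */
      counts[255] = 30;  /* -1 (wraps to the top of the histogram) */
      counts[2] = 10;
      bits = 1.0 * counts[0];  /* weight_0 == 1 in the caller above */
      for (i = 1; i < (256 >> 4); ++i) {
        bits += exp_val * (counts[i] + counts[256 - i]);
        exp_val *= 0.6;
      }
      printf("bias = %f\n", -0.1 * bits);  /* more mass near 0, lower cost */
      return 0;
    }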
static float PredictionCostSpatialHistogram(const int accumulated[4][256], static int64_t PredictionCostSpatialHistogram(
const int tile[4][256]) { const uint32_t accumulated[HISTO_SIZE], const uint32_t tile[HISTO_SIZE],
int mode, int left_mode, int above_mode) {
int i; int i;
float retval = 0.f; int64_t retval = 0;
for (i = 0; i < 4; ++i) { for (i = 0; i < 4; ++i) {
const float kExpValue = 0.94f; const uint64_t kExpValue = 94;
retval += PredictionCostSpatial(tile[i], 1, kExpValue); retval += PredictionCostBias(&tile[i * 256], 1, kExpValue);
retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]); // Compute the new cost if 'tile' is added to 'accumulate' but also add the
// cost of the current histogram to guide the spatial predictor selection.
// Basically, favor low entropy, locally and globally.
retval += (int64_t)VP8LCombinedShannonEntropy(&tile[i * 256],
&accumulated[i * 256]);
} }
return (float)retval; // Favor keeping the areas locally similar.
if (mode == left_mode) retval -= kSpatialPredictorBias;
if (mode == above_mode) retval -= kSpatialPredictorBias;
return retval;
} }
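
Note: a toy example of the locality bias. A mode matching the left and the above tiles gets kSpatialPredictorBias (15 bits, in fixed point) subtracted per match, so it can beat a slightly cheaper non-matching mode (numbers made up, precision value assumed):

    #include <stdint.h>
    #include <stdio.h>

    #define LOG_2_PRECISION_BITS 23  /* assumed; see src/dsp/lossless_common.h */
    static const int64_t kSpatialPredictorBias = 15ll << LOG_2_PRECISION_BITS;

    int main(void) {
      int64_t cost_match = (int64_t)1000 << LOG_2_PRECISION_BITS;
      const int64_t cost_other = (int64_t)990 << LOG_2_PRECISION_BITS;
      cost_match -= 2 * kSpatialPredictorBias;  /* left + above discounts */
      printf("matching mode wins: %d\n", cost_match < cost_other);  /* 1 */
      return 0;
    }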
static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) { static WEBP_INLINE void UpdateHisto(uint32_t histo_argb[HISTO_SIZE],
++histo_argb[0][argb >> 24]; uint32_t argb) {
++histo_argb[1][(argb >> 16) & 0xff]; ++histo_argb[0 * 256 + (argb >> 24)];
++histo_argb[2][(argb >> 8) & 0xff]; ++histo_argb[1 * 256 + ((argb >> 16) & 0xff)];
++histo_argb[3][argb & 0xff]; ++histo_argb[2 * 256 + ((argb >> 8) & 0xff)];
++histo_argb[3 * 256 + (argb & 0xff)];
} }
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
@ -91,8 +113,6 @@ static WEBP_INLINE void PredictBatch(int mode, int x_start, int y,
} }
#if (WEBP_NEAR_LOSSLESS == 1) #if (WEBP_NEAR_LOSSLESS == 1)
static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) { static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24)); const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff)); const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
@ -291,23 +311,80 @@ static WEBP_INLINE void GetResidual(
} }
} }
// Returns best predictor and updates the accumulated histogram. // Accessors to residual histograms.
static WEBP_INLINE uint32_t* GetHistoArgb(uint32_t* const all_histos,
int subsampling_index, int mode) {
return &all_histos[(subsampling_index * kNumPredModes + mode) * HISTO_SIZE];
}
static WEBP_INLINE const uint32_t* GetHistoArgbConst(
const uint32_t* const all_histos, int subsampling_index, int mode) {
return &all_histos[subsampling_index * kNumPredModes * HISTO_SIZE +
mode * HISTO_SIZE];
}
// Accessors to accumulated residual histogram.
static WEBP_INLINE uint32_t* GetAccumulatedHisto(uint32_t* all_accumulated,
int subsampling_index) {
return &all_accumulated[subsampling_index * HISTO_SIZE];
}
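
Note: the accessors flatten a conceptual [subsampling][mode][HISTO_SIZE] array into one allocation; the two index expressions above compute the same offset:

    #include <assert.h>

    enum { HISTO_SIZE = 4 * 256, kNumPredModes = 14 };

    int main(void) {
      const int subsampling_index = 2, mode = 5;
      const int flat = (subsampling_index * kNumPredModes + mode) * HISTO_SIZE;
      assert(flat == subsampling_index * kNumPredModes * HISTO_SIZE +
                         mode * HISTO_SIZE);
      return 0;
    }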
// Find and store the best predictor for a tile at subsampling
// 'subsampling_index'.
static void GetBestPredictorForTile(const uint32_t* const all_argb,
int subsampling_index, int tile_x,
int tile_y, int tiles_per_row,
uint32_t* all_accumulated_argb,
uint32_t** const all_modes,
uint32_t* const all_pred_histos) {
uint32_t* const accumulated_argb =
GetAccumulatedHisto(all_accumulated_argb, subsampling_index);
uint32_t* const modes = all_modes[subsampling_index];
uint32_t* const pred_histos =
&all_pred_histos[subsampling_index * kNumPredModes];
// Prediction modes of the left and above neighbor tiles.
const int left_mode =
(tile_x > 0) ? (modes[tile_y * tiles_per_row + tile_x - 1] >> 8) & 0xff
: 0xff;
const int above_mode =
(tile_y > 0) ? (modes[(tile_y - 1) * tiles_per_row + tile_x] >> 8) & 0xff
: 0xff;
int mode;
int64_t best_diff = WEBP_INT64_MAX;
uint32_t best_mode = 0;
const uint32_t* best_histo =
GetHistoArgbConst(all_argb, /*subsampling_index=*/0, best_mode);
for (mode = 0; mode < kNumPredModes; ++mode) {
const uint32_t* const histo_argb =
GetHistoArgbConst(all_argb, subsampling_index, mode);
const int64_t cur_diff = PredictionCostSpatialHistogram(
accumulated_argb, histo_argb, mode, left_mode, above_mode);
if (cur_diff < best_diff) {
best_histo = histo_argb;
best_diff = cur_diff;
best_mode = mode;
}
}
// Update the accumulated histogram.
VP8LAddVectorEq(best_histo, accumulated_argb, HISTO_SIZE);
modes[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (best_mode << 8);
++pred_histos[best_mode];
}
// Computes the residuals for the different predictors.
// If max_quantization > 1, assumes that near lossless processing will be // If max_quantization > 1, assumes that near lossless processing will be
// applied, quantizing residuals to multiples of quantization levels up to // applied, quantizing residuals to multiples of quantization levels up to
// max_quantization (the actual quantization level depends on smoothness near // max_quantization (the actual quantization level depends on smoothness near
// the given pixel). // the given pixel).
static int GetBestPredictorForTile(int width, int height, static void ComputeResidualsForTile(
int tile_x, int tile_y, int bits, int width, int height, int tile_x, int tile_y, int min_bits,
int accumulated[4][256], uint32_t update_up_to_index, uint32_t* const all_argb,
uint32_t* const argb_scratch, uint32_t* const argb_scratch, const uint32_t* const argb,
const uint32_t* const argb, int max_quantization, int exact, int used_subtract_green) {
int max_quantization, const int start_x = tile_x << min_bits;
int exact, int used_subtract_green, const int start_y = tile_y << min_bits;
const uint32_t* const modes) { const int tile_size = 1 << min_bits;
const int kNumPredModes = 14;
const int start_x = tile_x << bits;
const int start_y = tile_y << bits;
const int tile_size = 1 << bits;
const int max_y = GetMin(tile_size, height - start_y); const int max_y = GetMin(tile_size, height - start_y);
const int max_x = GetMin(tile_size, width - start_x); const int max_x = GetMin(tile_size, width - start_x);
// Whether there exist columns just outside the tile. // Whether there exist columns just outside the tile.
@ -318,35 +395,20 @@ static int GetBestPredictorForTile(int width, int height,
#if (WEBP_NEAR_LOSSLESS == 1) #if (WEBP_NEAR_LOSSLESS == 1)
const int context_width = max_x + have_left + (max_x < width - start_x); const int context_width = max_x + have_left + (max_x < width - start_x);
#endif #endif
const int tiles_per_row = VP8LSubSampleSize(width, bits);
// Prediction modes of the left and above neighbor tiles.
const int left_mode = (tile_x > 0) ?
(modes[tile_y * tiles_per_row + tile_x - 1] >> 8) & 0xff : 0xff;
const int above_mode = (tile_y > 0) ?
(modes[(tile_y - 1) * tiles_per_row + tile_x] >> 8) & 0xff : 0xff;
// The width of upper_row and current_row is one pixel larger than image width // The width of upper_row and current_row is one pixel larger than image width
// to allow the top right pixel to point to the leftmost pixel of the next row // to allow the top right pixel to point to the leftmost pixel of the next row
// when at the right edge. // when at the right edge.
uint32_t* upper_row = argb_scratch; uint32_t* upper_row = argb_scratch;
uint32_t* current_row = upper_row + width + 1; uint32_t* current_row = upper_row + width + 1;
uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1); uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1);
float best_diff = MAX_DIFF_COST;
int best_mode = 0;
int mode; int mode;
int histo_stack_1[4][256];
int histo_stack_2[4][256];
// Need pointers to be able to swap arrays. // Need pointers to be able to swap arrays.
int (*histo_argb)[256] = histo_stack_1;
int (*best_histo)[256] = histo_stack_2;
int i, j;
uint32_t residuals[1 << MAX_TRANSFORM_BITS]; uint32_t residuals[1 << MAX_TRANSFORM_BITS];
assert(bits <= MAX_TRANSFORM_BITS);
assert(max_x <= (1 << MAX_TRANSFORM_BITS)); assert(max_x <= (1 << MAX_TRANSFORM_BITS));
for (mode = 0; mode < kNumPredModes; ++mode) { for (mode = 0; mode < kNumPredModes; ++mode) {
float cur_diff;
int relative_y; int relative_y;
memset(histo_argb, 0, sizeof(histo_stack_1)); uint32_t* const histo_argb =
GetHistoArgb(all_argb, /*subsampling_index=*/0, mode);
if (start_y > 0) { if (start_y > 0) {
// Read the row above the tile which will become the first upper_row. // Read the row above the tile which will become the first upper_row.
// Include a pixel to the left if it exists; include a pixel to the right // Include a pixel to the left if it exists; include a pixel to the right
@ -382,41 +444,31 @@ static int GetBestPredictorForTile(int width, int height,
for (relative_x = 0; relative_x < max_x; ++relative_x) { for (relative_x = 0; relative_x < max_x; ++relative_x) {
UpdateHisto(histo_argb, residuals[relative_x]); UpdateHisto(histo_argb, residuals[relative_x]);
} }
} if (update_up_to_index > 0) {
cur_diff = PredictionCostSpatialHistogram( uint32_t subsampling_index;
(const int (*)[256])accumulated, (const int (*)[256])histo_argb); for (subsampling_index = 1; subsampling_index <= update_up_to_index;
// Favor keeping the areas locally similar. ++subsampling_index) {
if (mode == left_mode) cur_diff -= kSpatialPredictorBias; uint32_t* const super_histo =
if (mode == above_mode) cur_diff -= kSpatialPredictorBias; GetHistoArgb(all_argb, subsampling_index, mode);
for (relative_x = 0; relative_x < max_x; ++relative_x) {
if (cur_diff < best_diff) { UpdateHisto(super_histo, residuals[relative_x]);
int (*tmp)[256] = histo_argb; }
histo_argb = best_histo; }
best_histo = tmp; }
best_diff = cur_diff;
best_mode = mode;
} }
} }
for (i = 0; i < 4; i++) {
for (j = 0; j < 256; j++) {
accumulated[i][j] += best_histo[i][j];
}
}
return best_mode;
} }
// Converts pixels of the image to residuals with respect to predictions.
// If max_quantization > 1, applies near lossless processing, quantizing
// residuals to multiples of quantization levels up to max_quantization
// (the actual quantization level depends on smoothness near the given pixel).
-static void CopyImageWithPrediction(int width, int height,
-                                    int bits, uint32_t* const modes,
+static void CopyImageWithPrediction(int width, int height, int bits,
+                                    const uint32_t* const modes,
                                     uint32_t* const argb_scratch,
-                                    uint32_t* const argb,
-                                    int low_effort, int max_quantization,
-                                    int exact, int used_subtract_green) {
+                                    uint32_t* const argb, int low_effort,
+                                    int max_quantization, int exact,
+                                    int used_subtract_green) {
  const int tiles_per_row = VP8LSubSampleSize(width, bits);
  // The width of upper_row and current_row is one pixel larger than image width
  // to allow the top right pixel to point to the leftmost pixel of the next row

@@ -469,47 +521,307 @@ static void CopyImageWithPrediction(int width, int height,
    }
  }
+// Checks whether 'image' can be subsampled by finding the biggest power of 2
+// squares (defined by 'best_bits') of uniform value it is made out of.
+void VP8LOptimizeSampling(uint32_t* const image, int full_width,
+                          int full_height, int bits, int max_bits,
+                          int* best_bits_out) {
+  int width = VP8LSubSampleSize(full_width, bits);
+  int height = VP8LSubSampleSize(full_height, bits);
+  int old_width, x, y, square_size;
+  int best_bits = bits;
+  *best_bits_out = bits;
+  // Check rows first.
+  while (best_bits < max_bits) {
+    const int new_square_size = 1 << (best_bits + 1 - bits);
+    int is_good = 1;
+    square_size = 1 << (best_bits - bits);
+    for (y = 0; y + square_size < height; y += new_square_size) {
+      // Check the first lines of consecutive line groups.
+      if (memcmp(&image[y * width], &image[(y + square_size) * width],
+                 width * sizeof(*image)) != 0) {
+        is_good = 0;
+        break;
+      }
+    }
+    if (is_good) {
+      ++best_bits;
+    } else {
+      break;
+    }
+  }
+  if (best_bits == bits) return;
+  // Check columns.
+  while (best_bits > bits) {
+    int is_good = 1;
+    square_size = 1 << (best_bits - bits);
+    for (y = 0; is_good && y < height; ++y) {
+      for (x = 0; is_good && x < width; x += square_size) {
+        int i;
+        for (i = x + 1; i < GetMin(x + square_size, width); ++i) {
+          if (image[y * width + i] != image[y * width + x]) {
+            is_good = 0;
+            break;
+          }
+        }
+      }
+    }
+    if (is_good) {
+      break;
+    }
+    --best_bits;
+  }
+  if (best_bits == bits) return;
+  // Subsample the image.
+  old_width = width;
+  square_size = 1 << (best_bits - bits);
+  width = VP8LSubSampleSize(full_width, best_bits);
+  height = VP8LSubSampleSize(full_height, best_bits);
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      image[y * width + x] = image[square_size * (y * old_width + x)];
+    }
+  }
+  *best_bits_out = best_bits;
+}
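
To see what VP8LOptimizeSampling is testing for, here is a minimal standalone sketch of the same uniform-square idea on a toy mode map; it is not libwebp code, and the name CanHalveSampling is purely illustrative.

#include <stdio.h>

/* Returns 1 if every 2x2 block of 'image' (w x h) holds a single value,
 * i.e. the map could be stored at half the resolution. */
static int CanHalveSampling(const unsigned image[], int w, int h) {
  int x, y;
  for (y = 0; y < h; y += 2) {
    for (x = 0; x < w; x += 2) {
      const unsigned v = image[y * w + x];
      if ((x + 1 < w && image[y * w + x + 1] != v) ||
          (y + 1 < h && image[(y + 1) * w + x] != v) ||
          (x + 1 < w && y + 1 < h && image[(y + 1) * w + x + 1] != v)) {
        return 0;
      }
    }
  }
  return 1;
}

int main(void) {
  /* A 4x4 predictor-mode map made of uniform 2x2 squares. */
  const unsigned map[16] = { 1, 1, 2, 2,
                             1, 1, 2, 2,
                             3, 3, 4, 4,
                             3, 3, 4, 4 };
  printf("can halve: %d\n", CanHalveSampling(map, 4, 4));  /* prints 1 */
  return 0;
}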
+// Computes the best predictor image.
+// Finds the best predictors per tile. Once done, finds the best predictor image
+// sampling.
+// best_bits is set to 0 in case of error.
+// The following requires some glossary:
+// - a tile is a square of side 2^min_bits pixels.
+// - a super-tile of a tile is a square of side 2^bits pixels with bits in
+//   [min_bits+1, max_bits].
+// - the max-tile of a tile is the square of 2^max_bits pixels containing it.
+//   If this max-tile crosses the border of an image, it is cropped.
+// - tile, super-tiles and max_tile are aligned on powers of 2 in the original
+//   image.
+// - coordinates for tile, super-tile, max-tile are respectively named
+//   tile_x, super_tile_x, max_tile_x at their bit scale.
+// - in the max-tile, a tile has local coordinates (local_tile_x, local_tile_y).
+// The tiles are processed in the following zigzag order to complete the
+// super-tiles as soon as possible:
+//  1  2 |  5  6
+//  3  4 |  7  8
+// --------------
+//  9 10 | 13 14
+// 11 12 | 15 16
+// When computing the residuals for a tile, the histogram of the above
+// super-tile is updated. If this super-tile is finished, its histogram is used
+// to update the histogram of the next super-tile and so on up to the max-tile.
+static void GetBestPredictorsAndSubSampling(
+    int width, int height, const int min_bits, const int max_bits,
+    uint32_t* const argb_scratch, const uint32_t* const argb,
+    int max_quantization, int exact, int used_subtract_green,
+    const WebPPicture* const pic, int percent_range, int* const percent,
+    uint32_t** const all_modes, int* best_bits, uint32_t** best_mode) {
+  const uint32_t tiles_per_row = VP8LSubSampleSize(width, min_bits);
+  const uint32_t tiles_per_col = VP8LSubSampleSize(height, min_bits);
+  int64_t best_cost;
+  uint32_t subsampling_index;
+  const uint32_t max_subsampling_index = max_bits - min_bits;
+  // Compute the needed memory size for residual histograms, accumulated
+  // residual histograms and predictor histograms.
+  const int num_argb = (max_subsampling_index + 1) * kNumPredModes * HISTO_SIZE;
+  const int num_accumulated_rgb = (max_subsampling_index + 1) * HISTO_SIZE;
+  const int num_predictors = (max_subsampling_index + 1) * kNumPredModes;
+  uint32_t* const raw_data = (uint32_t*)WebPSafeCalloc(
+      num_argb + num_accumulated_rgb + num_predictors, sizeof(uint32_t));
+  uint32_t* const all_argb = raw_data;
+  uint32_t* const all_accumulated_argb = all_argb + num_argb;
+  uint32_t* const all_pred_histos = all_accumulated_argb + num_accumulated_rgb;
+  const int max_tile_size = 1 << max_subsampling_index;  // in tile size
+  int percent_start = *percent;
+  // When using the residuals of a tile for its super-tiles, you can either:
+  // - use each residual to update the histogram of the super-tile, with a cost
+  //   of 4 * (1<<n)^2 increment operations (4 for the number of channels, and
+  //   (1<<n)^2 for the number of pixels in the tile)
+  // - use the histogram of the tile to update the histogram of the super-tile,
+  //   with a cost of HISTO_SIZE (1024)
+  // The first method is therefore faster until n==4. 'update_up_to_index'
+  // defines the maximum subsampling_index for which the residuals should be
+  // individually added to the super-tile histogram.
+  const uint32_t update_up_to_index =
+      GetMax(GetMin(4, max_bits), min_bits) - min_bits;
+  // Coordinates in the max-tile in tile units.
+  uint32_t local_tile_x = 0, local_tile_y = 0;
+  uint32_t max_tile_x = 0, max_tile_y = 0;
+  uint32_t tile_x = 0, tile_y = 0;
+  *best_bits = 0;
+  *best_mode = NULL;
+  if (raw_data == NULL) return;
+  while (tile_y < tiles_per_col) {
+    ComputeResidualsForTile(width, height, tile_x, tile_y, min_bits,
+                            update_up_to_index, all_argb, argb_scratch, argb,
+                            max_quantization, exact, used_subtract_green);
+    // Update all the super-tiles that are complete.
+    subsampling_index = 0;
+    while (1) {
+      const uint32_t super_tile_x = tile_x >> subsampling_index;
+      const uint32_t super_tile_y = tile_y >> subsampling_index;
+      const uint32_t super_tiles_per_row =
+          VP8LSubSampleSize(width, min_bits + subsampling_index);
+      GetBestPredictorForTile(all_argb, subsampling_index, super_tile_x,
+                              super_tile_y, super_tiles_per_row,
+                              all_accumulated_argb, all_modes, all_pred_histos);
+      if (subsampling_index == max_subsampling_index) break;
+      // Update the following super-tile histogram if it has not been updated
+      // yet.
+      ++subsampling_index;
+      if (subsampling_index > update_up_to_index &&
+          subsampling_index <= max_subsampling_index) {
+        VP8LAddVectorEq(
+            GetHistoArgbConst(all_argb, subsampling_index - 1, /*mode=*/0),
+            GetHistoArgb(all_argb, subsampling_index, /*mode=*/0),
+            HISTO_SIZE * kNumPredModes);
+      }
+      // Check whether the super-tile is not complete (if the smallest tile
+      // is not at the end of a line/column or at the beginning of a super-tile
+      // of size (1 << subsampling_index)).
+      if (!((tile_x == (tiles_per_row - 1) ||
+             (local_tile_x + 1) % (1 << subsampling_index) == 0) &&
+            (tile_y == (tiles_per_col - 1) ||
+             (local_tile_y + 1) % (1 << subsampling_index) == 0))) {
+        --subsampling_index;
+        // subsampling_index now is the index of the last finished super-tile.
+        break;
+      }
+    }
+    // Reset all the histograms belonging to finished tiles.
+    memset(all_argb, 0,
+           HISTO_SIZE * kNumPredModes * (subsampling_index + 1) *
+               sizeof(*all_argb));
+    if (subsampling_index == max_subsampling_index) {
+      // If a new max-tile is started.
+      if (tile_x == (tiles_per_row - 1)) {
+        max_tile_x = 0;
+        ++max_tile_y;
+      } else {
+        ++max_tile_x;
+      }
+      local_tile_x = 0;
+      local_tile_y = 0;
+    } else {
+      // Proceed with the Z traversal.
+      uint32_t coord_x = local_tile_x >> subsampling_index;
+      uint32_t coord_y = local_tile_y >> subsampling_index;
+      if (tile_x == (tiles_per_row - 1) && coord_x % 2 == 0) {
+        ++coord_y;
+      } else {
+        if (coord_x % 2 == 0) {
+          ++coord_x;
+        } else {
+          // Z traversal.
+          ++coord_y;
+          --coord_x;
+        }
+      }
+      local_tile_x = coord_x << subsampling_index;
+      local_tile_y = coord_y << subsampling_index;
+    }
+    tile_x = max_tile_x * max_tile_size + local_tile_x;
+    tile_y = max_tile_y * max_tile_size + local_tile_y;
+    if (tile_x == 0 &&
+        !WebPReportProgress(
+            pic, percent_start + percent_range * tile_y / tiles_per_col,
+            percent)) {
+      WebPSafeFree(raw_data);
+      return;
+    }
+  }
+  // Figure out the best sampling.
+  best_cost = WEBP_INT64_MAX;
+  for (subsampling_index = 0; subsampling_index <= max_subsampling_index;
+       ++subsampling_index) {
+    int plane;
+    const uint32_t* const accumulated =
+        GetAccumulatedHisto(all_accumulated_argb, subsampling_index);
+    int64_t cost = VP8LShannonEntropy(
+        &all_pred_histos[subsampling_index * kNumPredModes], kNumPredModes);
+    for (plane = 0; plane < 4; ++plane) {
+      cost += VP8LShannonEntropy(&accumulated[plane * 256], 256);
+    }
+    if (cost < best_cost) {
+      best_cost = cost;
+      *best_bits = min_bits + subsampling_index;
+      *best_mode = all_modes[subsampling_index];
+    }
+  }
+  WebPSafeFree(raw_data);
+  VP8LOptimizeSampling(*best_mode, width, height, *best_bits,
+                       MAX_TRANSFORM_BITS, best_bits);
+}
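
Two details above are worth making concrete. First, the update_up_to_index crossover: per-pixel updates cost 4 * (1<<n)^2 increments (256 at n == 3, exactly 1024 at n == 4, 4096 at n == 5) against a flat HISTO_SIZE (1024) per whole-histogram merge, which is why the cap sits at 4. Second, the "1 2 / 3 4" visiting order in the comment is a Morton (Z-order) walk; the throwaway sketch below (not libwebp code, 4x4 grid chosen only for the demo) ranks tiles by interleaved coordinate bits and prints the same figure.

#include <stdio.h>

/* Morton (Z-order) index of tile (x, y): interleave the bits of y and x. */
static unsigned MortonIndex(unsigned x, unsigned y) {
  unsigned m = 0;
  int b;
  for (b = 0; b < 16; ++b) {
    m |= ((x >> b) & 1u) << (2 * b);
    m |= ((y >> b) & 1u) << (2 * b + 1);
  }
  return m;
}

int main(void) {
  /* Rank the tiles of a 4x4 grid by Morton index: this reproduces the
   * 1..16 order of the comment (1 2 5 6 / 3 4 7 8 / 9 10 13 14 / ...). */
  unsigned x, y;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) printf("%3u", MortonIndex(x, y) + 1);
    printf("\n");
  }
  return 0;
}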
// Finds the best predictor for each tile, and converts the image to residuals
// with respect to predictions. If near_lossless_quality < 100, applies
// near lossless processing, shaving off more bits of residuals for lower
// qualities.
-int VP8LResidualImage(int width, int height, int bits, int low_effort,
-                      uint32_t* const argb, uint32_t* const argb_scratch,
-                      uint32_t* const image, int near_lossless_quality,
-                      int exact, int used_subtract_green,
-                      const WebPPicture* const pic, int percent_range,
-                      int* const percent) {
-  const int tiles_per_row = VP8LSubSampleSize(width, bits);
-  const int tiles_per_col = VP8LSubSampleSize(height, bits);
+int VP8LResidualImage(int width, int height, int min_bits, int max_bits,
+                      int low_effort, uint32_t* const argb,
+                      uint32_t* const argb_scratch, uint32_t* const image,
+                      int near_lossless_quality, int exact,
+                      int used_subtract_green, const WebPPicture* const pic,
+                      int percent_range, int* const percent,
+                      int* const best_bits) {
  int percent_start = *percent;
-  int tile_y;
-  int histo[4][256];
  const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
  if (low_effort) {
+    const int tiles_per_row = VP8LSubSampleSize(width, max_bits);
+    const int tiles_per_col = VP8LSubSampleSize(height, max_bits);
    int i;
    for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
      image[i] = ARGB_BLACK | (kPredLowEffort << 8);
    }
+    *best_bits = max_bits;
  } else {
-    memset(histo, 0, sizeof(histo));
-    for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
-      int tile_x;
-      for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
-        const int pred = GetBestPredictorForTile(
-            width, height, tile_x, tile_y, bits, histo, argb_scratch, argb,
-            max_quantization, exact, used_subtract_green, image);
-        image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
-      }
-      if (!WebPReportProgress(
-              pic, percent_start + percent_range * tile_y / tiles_per_col,
-              percent)) {
-        return 0;
-      }
-    }
+    // Allocate data to try all samplings from min_bits to max_bits.
+    int bits;
+    uint32_t sum_num_pixels = 0u;
+    uint32_t *modes_raw, *best_mode;
+    uint32_t* modes[MAX_TRANSFORM_BITS + 1];
+    uint32_t num_pixels[MAX_TRANSFORM_BITS + 1];
+    for (bits = min_bits; bits <= max_bits; ++bits) {
+      const int tiles_per_row = VP8LSubSampleSize(width, bits);
+      const int tiles_per_col = VP8LSubSampleSize(height, bits);
+      num_pixels[bits] = tiles_per_row * tiles_per_col;
+      sum_num_pixels += num_pixels[bits];
+    }
+    modes_raw = (uint32_t*)WebPSafeMalloc(sum_num_pixels, sizeof(*modes_raw));
+    if (modes_raw == NULL) return 0;
+    // Have modes point to the right global memory modes_raw.
+    modes[min_bits] = modes_raw;
+    for (bits = min_bits + 1; bits <= max_bits; ++bits) {
+      modes[bits] = modes[bits - 1] + num_pixels[bits - 1];
+    }
+    // Find the best sampling.
+    GetBestPredictorsAndSubSampling(
+        width, height, min_bits, max_bits, argb_scratch, argb, max_quantization,
+        exact, used_subtract_green, pic, percent_range, percent,
+        &modes[min_bits], best_bits, &best_mode);
+    if (*best_bits == 0) {
+      WebPSafeFree(modes_raw);
+      return 0;
+    }
+    // Keep the best predictor image.
+    memcpy(image, best_mode,
+           VP8LSubSampleSize(width, *best_bits) *
+               VP8LSubSampleSize(height, *best_bits) * sizeof(*image));
+    WebPSafeFree(modes_raw);
  }
-  CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
+  CopyImageWithPrediction(width, height, *best_bits, image, argb_scratch, argb,
                          low_effort, max_quantization, exact,
                          used_subtract_green);
  return WebPReportProgress(pic, percent_start + percent_range, percent);
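
The modes[] setup above carves one allocation into per-bits slices sized by the tiling. A minimal hedged sketch of the same layout (plain C; SubSampleSize and the dimensions are illustrative stand-ins, not libwebp calls):

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for VP8LSubSampleSize(): number of tiles covering
 * 'size' pixels with tiles of side (1 << bits). */
static unsigned SubSampleSize(unsigned size, unsigned bits) {
  return (size + (1u << bits) - 1) >> bits;
}

int main(void) {
  const unsigned width = 1000, height = 600, min_bits = 3, max_bits = 6;
  unsigned bits, total = 0;
  unsigned* slice[7] = { 0 };    /* indices min_bits..max_bits used */
  unsigned num_tiles[7] = { 0 };
  unsigned* arena;
  /* First pass sizes the arena, as VP8LResidualImage does. */
  for (bits = min_bits; bits <= max_bits; ++bits) {
    num_tiles[bits] = SubSampleSize(width, bits) * SubSampleSize(height, bits);
    total += num_tiles[bits];
  }
  arena = (unsigned*)malloc(total * sizeof(*arena));
  if (arena == NULL) return 1;
  /* Second pass: each per-bits slice starts where the previous one ends. */
  slice[min_bits] = arena;
  for (bits = min_bits + 1; bits <= max_bits; ++bits) {
    slice[bits] = slice[bits - 1] + num_tiles[bits - 1];
  }
  for (bits = min_bits; bits <= max_bits; ++bits) {
    printf("bits=%u: %u tiles at offset %ld\n", bits, num_tiles[bits],
           (long)(slice[bits] - arena));
  }
  free(arena);
  return 0;
}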
@@ -539,48 +851,51 @@ static WEBP_INLINE uint32_t MultipliersToColorCode(
         m->green_to_red_;
}

-static float PredictionCostCrossColor(const int accumulated[256],
-                                      const int counts[256]) {
+static int64_t PredictionCostCrossColor(const uint32_t accumulated[256],
+                                        const uint32_t counts[256]) {
  // Favor low entropy, locally and globally.
  // Favor small absolute values for PredictionCostSpatial
-  static const float kExpValue = 2.4f;
-  return VP8LCombinedShannonEntropy(counts, accumulated) +
-         PredictionCostSpatial(counts, 3, kExpValue);
+  static const uint64_t kExpValue = 240;
+  return (int64_t)VP8LCombinedShannonEntropy(counts, accumulated) +
+         PredictionCostBias(counts, 3, kExpValue);
}

-static float GetPredictionCostCrossColorRed(
+static int64_t GetPredictionCostCrossColorRed(
    const uint32_t* argb, int stride, int tile_width, int tile_height,
    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
-    const int accumulated_red_histo[256]) {
-  int histo[256] = { 0 };
-  float cur_diff;
+    const uint32_t accumulated_red_histo[256]) {
+  uint32_t histo[256] = { 0 };
+  int64_t cur_diff;
  VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
                                green_to_red, histo);
  cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
  if ((uint8_t)green_to_red == prev_x.green_to_red_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
+    // favor keeping the areas locally similar
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
  }
  if ((uint8_t)green_to_red == prev_y.green_to_red_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
+    // favor keeping the areas locally similar
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
  }
  if (green_to_red == 0) {
-    cur_diff -= 3;
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
  }
  return cur_diff;
}
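
These costs moved from float to fixed point: values now carry LOG_2_PRECISION_BITS fractional bits, so a 3-bit bias becomes 3ll << LOG_2_PRECISION_BITS, and kExpValue 2.4f became the integer 240 (presumably a x100 scaling consumed inside PredictionCostBias). A minimal sketch of the representation, assuming a precision of 23 bits purely for the demo:

#include <stdint.h>
#include <stdio.h>

/* 23 is assumed here only for illustration and should not be read as the
 * library's authoritative LOG_2_PRECISION_BITS value. */
#define DEMO_PRECISION_BITS 23

int main(void) {
  /* A cost of 3 bits, in fixed point and converted back. */
  const int64_t cost_fixed = 3ll << DEMO_PRECISION_BITS;
  printf("fixed %lld -> %.1f bits\n", (long long)cost_fixed,
         (double)cost_fixed / (1 << DEMO_PRECISION_BITS));  /* 3.0 bits */
  return 0;
}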
-static void GetBestGreenToRed(
-    const uint32_t* argb, int stride, int tile_width, int tile_height,
-    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
-    const int accumulated_red_histo[256], VP8LMultipliers* const best_tx) {
+static void GetBestGreenToRed(const uint32_t* argb, int stride, int tile_width,
+                              int tile_height, VP8LMultipliers prev_x,
+                              VP8LMultipliers prev_y, int quality,
+                              const uint32_t accumulated_red_histo[256],
+                              VP8LMultipliers* const best_tx) {
  const int kMaxIters = 4 + ((7 * quality) >> 8);  // in range [4..6]
  int green_to_red_best = 0;
  int iter, offset;
-  float best_diff = GetPredictionCostCrossColorRed(
-      argb, stride, tile_width, tile_height, prev_x, prev_y,
-      green_to_red_best, accumulated_red_histo);
+  int64_t best_diff = GetPredictionCostCrossColorRed(
+      argb, stride, tile_width, tile_height, prev_x, prev_y, green_to_red_best,
+      accumulated_red_histo);
  for (iter = 0; iter < kMaxIters; ++iter) {
    // ColorTransformDelta is a 3.5 bit fixed point, so 32 is equal to
    // one in color computation. Having initial delta here as 1 is sufficient

@@ -589,7 +904,7 @@ static void GetBestGreenToRed(
    // Try a negative and a positive delta from the best known value.
    for (offset = -delta; offset <= delta; offset += 2 * delta) {
      const int green_to_red_cur = offset + green_to_red_best;
-      const float cur_diff = GetPredictionCostCrossColorRed(
+      const int64_t cur_diff = GetPredictionCostCrossColorRed(
          argb, stride, tile_width, tile_height, prev_x, prev_y,
          green_to_red_cur, accumulated_red_histo);
      if (cur_diff < best_diff) {

@@ -601,45 +916,50 @@ static void GetBestGreenToRed(
  best_tx->green_to_red_ = (green_to_red_best & 0xff);
}

-static float GetPredictionCostCrossColorBlue(
+static int64_t GetPredictionCostCrossColorBlue(
    const uint32_t* argb, int stride, int tile_width, int tile_height,
-    VP8LMultipliers prev_x, VP8LMultipliers prev_y,
-    int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256]) {
-  int histo[256] = { 0 };
-  float cur_diff;
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_blue,
+    int red_to_blue, const uint32_t accumulated_blue_histo[256]) {
+  uint32_t histo[256] = { 0 };
+  int64_t cur_diff;
  VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
                                 green_to_blue, red_to_blue, histo);
  cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
  if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
+    // favor keeping the areas locally similar
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
  }
  if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
+    // favor keeping the areas locally similar
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
  }
  if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
+    // favor keeping the areas locally similar
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
  }
  if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
+    // favor keeping the areas locally similar
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
  }
  if (green_to_blue == 0) {
-    cur_diff -= 3;
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
  }
  if (red_to_blue == 0) {
-    cur_diff -= 3;
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
  }
  return cur_diff;
}

#define kGreenRedToBlueNumAxis 8
#define kGreenRedToBlueMaxIters 7
-static void GetBestGreenRedToBlue(
-    const uint32_t* argb, int stride, int tile_width, int tile_height,
-    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
-    const int accumulated_blue_histo[256],
-    VP8LMultipliers* const best_tx) {
+static void GetBestGreenRedToBlue(const uint32_t* argb, int stride,
+                                  int tile_width, int tile_height,
+                                  VP8LMultipliers prev_x,
+                                  VP8LMultipliers prev_y, int quality,
+                                  const uint32_t accumulated_blue_histo[256],
+                                  VP8LMultipliers* const best_tx) {
  const int8_t offset[kGreenRedToBlueNumAxis][2] =
      {{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
  const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };

@@ -649,9 +969,9 @@ static void GetBestGreenRedToBlue(
  int red_to_blue_best = 0;
  int iter;
  // Initial value at origin:
-  float best_diff = GetPredictionCostCrossColorBlue(
-      argb, stride, tile_width, tile_height, prev_x, prev_y,
-      green_to_blue_best, red_to_blue_best, accumulated_blue_histo);
+  int64_t best_diff = GetPredictionCostCrossColorBlue(
+      argb, stride, tile_width, tile_height, prev_x, prev_y, green_to_blue_best,
+      red_to_blue_best, accumulated_blue_histo);
  for (iter = 0; iter < iters; ++iter) {
    const int delta = delta_lut[iter];
    int axis;

@@ -659,7 +979,7 @@ static void GetBestGreenRedToBlue(
      const int green_to_blue_cur =
          offset[axis][0] * delta + green_to_blue_best;
      const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
-      const float cur_diff = GetPredictionCostCrossColorBlue(
+      const int64_t cur_diff = GetPredictionCostCrossColorBlue(
          argb, stride, tile_width, tile_height, prev_x, prev_y,
          green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
      if (cur_diff < best_diff) {

@@ -684,13 +1004,10 @@ static void GetBestGreenRedToBlue(
#undef kGreenRedToBlueNumAxis
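
Both searches are greedy descents with a shrinking step: GetBestGreenToRed probes a negative and a positive delta from the best known value, while GetBestGreenRedToBlue probes the eight compass directions with steps 16, 16, 8, 4, 2, 2, 2. The toy sketch below runs the same pattern on an invented convex Cost() (everything in it is illustrative, not libwebp code) and walks close to the minimum:

#include <stdio.h>

/* Invented quadratic cost, standing in for the entropy cost of a transform. */
static long Cost(int gx, int rx) {
  return (long)(gx - 5) * (gx - 5) + (long)(rx + 12) * (rx + 12);
}

int main(void) {
  static const int offset[8][2] = {{0, -1},  {0, 1},  {-1, 0}, {1, 0},
                                   {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
  static const int delta_lut[7] = { 16, 16, 8, 4, 2, 2, 2 };
  int best_g = 0, best_r = 0, iter;
  long best = Cost(best_g, best_r);
  for (iter = 0; iter < 7; ++iter) {
    int axis;
    for (axis = 0; axis < 8; ++axis) {
      const int g = best_g + offset[axis][0] * delta_lut[iter];
      const int r = best_r + offset[axis][1] * delta_lut[iter];
      const long c = Cost(g, r);
      if (c < best) { best = c; best_g = g; best_r = r; }
    }
  }
  /* Prints "found (4, -12), cost 1": near the true optimum (5, -12). */
  printf("found (%d, %d), cost %ld\n", best_g, best_r, best);
  return 0;
}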
static VP8LMultipliers GetBestColorTransformForTile(
-    int tile_x, int tile_y, int bits,
-    VP8LMultipliers prev_x,
-    VP8LMultipliers prev_y,
-    int quality, int xsize, int ysize,
-    const int accumulated_red_histo[256],
-    const int accumulated_blue_histo[256],
-    const uint32_t* const argb) {
+    int tile_x, int tile_y, int bits, VP8LMultipliers prev_x,
+    VP8LMultipliers prev_y, int quality, int xsize, int ysize,
+    const uint32_t accumulated_red_histo[256],
+    const uint32_t accumulated_blue_histo[256], const uint32_t* const argb) {
  const int max_tile_size = 1 << bits;
  const int tile_y_offset = tile_y * max_tile_size;
  const int tile_x_offset = tile_x * max_tile_size;

@@ -728,13 +1045,13 @@ static void CopyTileWithColorTransform(int xsize, int ysize,
int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
                            uint32_t* const argb, uint32_t* image,
                            const WebPPicture* const pic, int percent_range,
-                            int* const percent) {
+                            int* const percent, int* const best_bits) {
  const int max_tile_size = 1 << bits;
  const int tile_xsize = VP8LSubSampleSize(width, bits);
  const int tile_ysize = VP8LSubSampleSize(height, bits);
  int percent_start = *percent;
-  int accumulated_red_histo[256] = { 0 };
-  int accumulated_blue_histo[256] = { 0 };
+  uint32_t accumulated_red_histo[256] = { 0 };
+  uint32_t accumulated_blue_histo[256] = { 0 };
  int tile_x, tile_y;
  VP8LMultipliers prev_x, prev_y;
  MultipliersClear(&prev_y);

@@ -788,5 +1105,7 @@ int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
      return 0;
    }
  }
+  VP8LOptimizeSampling(image, width, height, bits, MAX_TRANSFORM_BITS,
+                       best_bits);
  return 1;
}

--- a/src/enc/quant_enc.c
+++ b/src/enc/quant_enc.c

@@ -462,7 +462,7 @@ const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
-const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
+static const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

@@ -478,7 +478,9 @@ void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

-void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
+// Form all the ten Intra4x4 predictions in the yuv_p_ cache
+// for the 4x4 block it->i4_
+static void MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

@@ -1102,7 +1104,7 @@ static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
  uint8_t* tmp_dst = it->yuv_p_ + I4TMP;  // scratch buffer.
  InitScore(&rd_i4);
-  VP8MakeIntra4Preds(it);
+  MakeIntra4Preds(it);
  for (mode = 0; mode < NUM_BMODES; ++mode) {
    VP8ModeScore rd_tmp;
    int16_t tmp_levels[16];

@@ -1237,7 +1239,7 @@ static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
        it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
    uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
-    VP8MakeIntra4Preds(it);
+    MakeIntra4Preds(it);
    nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                            src, dst, mode) << it->i4_;
  } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));

@@ -1305,7 +1307,7 @@ static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
-      VP8MakeIntra4Preds(it);
+      MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT

--- a/src/enc/vp8i_enc.h
+++ b/src/enc/vp8i_enc.h

@@ -16,6 +16,7 @@
#include <string.h>   // for memcpy()

#include "src/dec/common_dec.h"
+#include "src/dsp/cpu.h"
#include "src/dsp/dsp.h"
#include "src/utils/bit_writer_utils.h"
#include "src/utils/thread_utils.h"

@@ -31,7 +32,7 @@ extern "C" {
// version numbers
#define ENC_MAJ_VERSION 1
-#define ENC_MIN_VERSION 4
+#define ENC_MIN_VERSION 5
#define ENC_REV_VERSION 0

enum { MAX_LF_LEVELS = 64,   // Maximum loop filter level

@@ -78,7 +79,6 @@ typedef enum {   // Rate-distortion optimization levels
extern const uint16_t VP8Scan[16];
extern const uint16_t VP8UVModeOffsets[4];
extern const uint16_t VP8I16ModeOffsets[4];
-extern const uint16_t VP8I4ModeOffsets[NUM_BMODES];

// Layout of prediction blocks
// intra 16x16

@@ -234,7 +234,11 @@ typedef struct {
  VP8BitWriter* bw_;          // current bit-writer
  uint8_t* preds_;            // intra mode predictors (4x4 blocks)
  uint32_t* nz_;              // non-zero pattern
+#if WEBP_AARCH64 && BPS == 32
+  uint8_t i4_boundary_[40];   // 32+8 boundary samples needed by intra4x4
+#else
  uint8_t i4_boundary_[37];   // 32+5 boundary samples needed by intra4x4
+#endif
  uint8_t* i4_top_;           // pointer to the current top boundary sample
  int i4_;                    // current intra4x4 mode being tested
  int top_nz_[9];             // top-non-zero context.

@@ -267,8 +271,6 @@ typedef struct {
// in iterator.c
// must be called first
void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it);
-// restart a scan
-void VP8IteratorReset(VP8EncIterator* const it);
// reset iterator position to row 'y'
void VP8IteratorSetRow(VP8EncIterator* const it, int y);
// set count down (=number of iterations to go)

@@ -444,9 +446,6 @@ extern const uint8_t VP8Cat6[];
void VP8MakeLuma16Preds(const VP8EncIterator* const it);
// Form all the four Chroma8x8 predictions in the yuv_p_ cache
void VP8MakeChroma8Preds(const VP8EncIterator* const it);
-// Form all the ten Intra4x4 predictions in the yuv_p_ cache
-// for the 4x4 block it->i4_
-void VP8MakeIntra4Preds(const VP8EncIterator* const it);
// Rate calculation
int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd);
int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]);

--- a/src/enc/vp8l_enc.c
+++ b/src/enc/vp8l_enc.c

@@ -30,6 +30,10 @@
// Maximum number of histogram images (sub-blocks).
#define MAX_HUFF_IMAGE_SIZE 2600
+#define MAX_HUFFMAN_BITS (MIN_HUFFMAN_BITS + (1 << NUM_HUFFMAN_BITS) - 1)
+// Empirical value for which it becomes too computationally expensive to
+// compute the best predictor image.
+#define MAX_PREDICTOR_IMAGE_SIZE (1 << 14)

// -----------------------------------------------------------------------------
// Palette

@@ -140,8 +144,8 @@ static int AnalyzeEntropy(const uint32_t* argb,
    curr_row += argb_stride;
  }
  {
-    float entropy_comp[kHistoTotal];
-    float entropy[kNumEntropyIx];
+    uint64_t entropy_comp[kHistoTotal];
+    uint64_t entropy[kNumEntropyIx];
    int k;
    int last_mode_to_analyze = use_palette ? kPalette : kSpatialSubGreen;
    int j;

@@ -179,19 +183,19 @@ static int AnalyzeEntropy(const uint32_t* argb,
      // When including transforms, there is an overhead in bits from
      // storing them. This overhead is small but matters for small images.
      // For spatial, there are 14 transformations.
-      entropy[kSpatial] += VP8LSubSampleSize(width, transform_bits) *
+      entropy[kSpatial] += (uint64_t)VP8LSubSampleSize(width, transform_bits) *
                           VP8LSubSampleSize(height, transform_bits) *
                           VP8LFastLog2(14);
      // For color transforms: 24 as only 3 channels are considered in a
      // ColorTransformElement.
-      entropy[kSpatialSubGreen] += VP8LSubSampleSize(width, transform_bits) *
-                                   VP8LSubSampleSize(height, transform_bits) *
-                                   VP8LFastLog2(24);
+      entropy[kSpatialSubGreen] +=
+          (uint64_t)VP8LSubSampleSize(width, transform_bits) *
+          VP8LSubSampleSize(height, transform_bits) * VP8LFastLog2(24);
      // For palettes, add the cost of storing the palette.
      // We empirically estimate the cost of a compressed entry as 8 bits.
      // The palette is differential-coded when compressed hence a much
      // lower cost than sizeof(uint32_t)*8.
-      entropy[kPalette] += palette_size * 8;
+      entropy[kPalette] += (palette_size * 8ull) << LOG_2_PRECISION_BITS;

      *min_entropy_ix = kDirect;
      for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
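
The overhead terms above amount to "tiles x log2(number of possible per-tile values)", kept in fixed point via VP8LFastLog2. A rough standalone sketch of the estimate for a 512x512 image with 16-pixel tiles, using plain double instead of the library's fixed-point log (all values illustrative):

#include <math.h>
#include <stdio.h>

static unsigned SubSampleSize(unsigned size, unsigned bits) {
  return (size + (1u << bits) - 1) >> bits;
}

int main(void) {
  const unsigned width = 512, height = 512, transform_bits = 4;
  const unsigned tiles = SubSampleSize(width, transform_bits) *
                         SubSampleSize(height, transform_bits);
  /* 14 spatial predictors: each tile needs about log2(14) bits to signal. */
  const double overhead_bits = tiles * (log(14.0) / log(2.0));
  printf("%u tiles -> ~%.0f bits (~%.1f KiB) of predictor signaling\n",
         tiles, overhead_bits, overhead_bits / 8.0 / 1024.0);
  return 0;
}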
@@ -231,17 +235,33 @@ static int AnalyzeEntropy(const uint32_t* argb,
  }
}

+// Clamp histogram and transform bits.
+static int ClampBits(int width, int height, int bits, int min_bits,
+                     int max_bits, int image_size_max) {
+  int image_size;
+  bits = (bits < min_bits) ? min_bits : (bits > max_bits) ? max_bits : bits;
+  image_size = VP8LSubSampleSize(width, bits) * VP8LSubSampleSize(height, bits);
+  while (bits < max_bits && image_size > image_size_max) {
+    ++bits;
+    image_size =
+        VP8LSubSampleSize(width, bits) * VP8LSubSampleSize(height, bits);
+  }
+  // In case the bits reduce the image too much, choose the smallest value
+  // setting the histogram image size to 1.
+  while (bits > min_bits && image_size == 1) {
+    image_size = VP8LSubSampleSize(width, bits - 1) *
+                 VP8LSubSampleSize(height, bits - 1);
+    if (image_size != 1) break;
+    --bits;
+  }
+  return bits;
+}
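
A quick standalone illustration of where that clamping lands (MiniClampBits mirrors the loop above under an invented name; the dimensions and budgets are arbitrary):

#include <stdio.h>

static int SubSampleSize(int size, int bits) {
  return (size + (1 << bits) - 1) >> bits;
}

/* Same clamping logic as above, re-implemented for the demo. */
static int MiniClampBits(int w, int h, int bits, int min_bits, int max_bits,
                         int image_size_max) {
  int image_size;
  bits = (bits < min_bits) ? min_bits : (bits > max_bits) ? max_bits : bits;
  image_size = SubSampleSize(w, bits) * SubSampleSize(h, bits);
  while (bits < max_bits && image_size > image_size_max) {
    ++bits;
    image_size = SubSampleSize(w, bits) * SubSampleSize(h, bits);
  }
  while (bits > min_bits && image_size == 1) {
    image_size = SubSampleSize(w, bits - 1) * SubSampleSize(h, bits - 1);
    if (image_size != 1) break;
    --bits;
  }
  return bits;
}

int main(void) {
  /* A 4000x3000 photo: bits grows from 4 to 7 to fit a 2600-tile budget. */
  printf("%d\n", MiniClampBits(4000, 3000, 4, 2, 9, 2600));  /* prints 7 */
  /* A tiny 8x8 image: a size-1 histogram image shrinks bits back down. */
  printf("%d\n", MiniClampBits(8, 8, 9, 2, 9, 2600));        /* prints 3 */
  return 0;
}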
static int GetHistoBits(int method, int use_palette, int width, int height) {
  // Make tile size a function of encoding method (Range: 0 to 6).
-  int histo_bits = (use_palette ? 9 : 7) - method;
-  while (1) {
-    const int huff_image_size = VP8LSubSampleSize(width, histo_bits) *
-                                VP8LSubSampleSize(height, histo_bits);
-    if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
-    ++histo_bits;
-  }
-  return (histo_bits < MIN_HUFFMAN_BITS) ? MIN_HUFFMAN_BITS :
-         (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
+  const int histo_bits = (use_palette ? 9 : 7) - method;
+  return ClampBits(width, height, histo_bits, MIN_HUFFMAN_BITS,
+                   MAX_HUFFMAN_BITS, MAX_HUFF_IMAGE_SIZE);
}

static int GetTransformBits(int method, int histo_bits) {

@@ -280,7 +300,7 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
  const int method = config->method;
  const int low_effort = (config->method == 0);
  int i;
-  int use_palette;
+  int use_palette, transform_bits;
  int n_lz77s;
  // If set to 0, analyze the cache with the computed cache value. If 1, also
  // analyze with no-cache.

@@ -297,7 +317,9 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
  // Empirical bit sizes.
  enc->histo_bits_ = GetHistoBits(method, use_palette,
                                  pic->width, pic->height);
-  enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
+  transform_bits = GetTransformBits(method, enc->histo_bits_);
+  enc->predictor_transform_bits_ = transform_bits;
+  enc->cross_color_transform_bits_ = transform_bits;

  if (low_effort) {
    // AnalyzeEntropy is somewhat slow.

@@ -311,8 +333,8 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
    // Try out multiple LZ77 on images with few colors.
    n_lz77s = (enc->palette_size_ > 0 && enc->palette_size_ <= 16) ? 2 : 1;
    if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride, use_palette,
-                        enc->palette_size_, enc->transform_bits_,
-                        &min_entropy_ix, red_and_blue_always_zero)) {
+                        enc->palette_size_, transform_bits, &min_entropy_ix,
+                        red_and_blue_always_zero)) {
      return 0;
    }
    if (method == 6 && config->quality == 100) {

@@ -661,11 +683,12 @@ static WEBP_INLINE void WriteHuffmanCodeWithExtraBits(
  VP8LPutBits(bw, (bits << depth) | symbol, depth + n_bits);
}

-static int StoreImageToBitMask(
-    VP8LBitWriter* const bw, int width, int histo_bits,
+static int StoreImageToBitMask(VP8LBitWriter* const bw, int width,
+                               int histo_bits,
                               const VP8LBackwardRefs* const refs,
-    const uint16_t* histogram_symbols,
-    const HuffmanTreeCode* const huffman_codes, const WebPPicture* const pic) {
+                               const uint32_t* histogram_symbols,
+                               const HuffmanTreeCode* const huffman_codes,
+                               const WebPPicture* const pic) {
  const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
  const int tile_mask = (histo_bits == 0) ? 0 : -(1 << histo_bits);
  // x and y trace the position in the image.

@@ -673,7 +696,7 @@ static int StoreImageToBitMask(
  int y = 0;
  int tile_x = x & tile_mask;
  int tile_y = y & tile_mask;
-  int histogram_ix = histogram_symbols[0];
+  int histogram_ix = (histogram_symbols[0] >> 8) & 0xffff;
  const HuffmanTreeCode* codes = huffman_codes + 5 * histogram_ix;
  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
  while (VP8LRefsCursorOk(&c)) {

@@ -681,8 +704,10 @@ static int StoreImageToBitMask(
    if ((tile_x != (x & tile_mask)) || (tile_y != (y & tile_mask))) {
      tile_x = x & tile_mask;
      tile_y = y & tile_mask;
-      histogram_ix = histogram_symbols[(y >> histo_bits) * histo_xsize +
-                                       (x >> histo_bits)];
+      histogram_ix = (histogram_symbols[(y >> histo_bits) * histo_xsize +
+                                        (x >> histo_bits)] >>
+                      8) &
+                     0xffff;
      codes = huffman_codes + 5 * histogram_ix;
    }
    if (PixOrCopyIsLiteral(v)) {
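
The histogram image now carries each tile's histogram index inside an ARGB word, in the 16 bits starting at bit 8 (the green and red channels), which is why readers shift by 8 and mask with 0xffff. A tiny round-trip check of that packing:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint32_t index = 777;          /* histogram index of some tile */
  const uint32_t pixel = index << 8;   /* packed into green+red channels */
  const uint32_t decoded = (pixel >> 8) & 0xffff;
  printf("packed 0x%08x, decoded %u\n", pixel, decoded);  /* decoded == 777 */
  return 0;
}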
@@ -738,7 +763,7 @@ static int EncodeImageNoHuffman(VP8LBitWriter* const bw,
  VP8LBackwardRefs* refs;
  HuffmanTreeToken* tokens = NULL;
  HuffmanTreeCode huffman_codes[5] = {{0, NULL, NULL}};
-  const uint16_t histogram_symbols[1] = {0};  // only one tree, one symbol
+  const uint32_t histogram_symbols[1] = {0};  // only one tree, one symbol
  int cache_bits = 0;
  VP8LHistogramSet* histogram_image = NULL;
  HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(

@@ -821,32 +846,32 @@ static int EncodeImageInternal(
    VP8LBitWriter* const bw, const uint32_t* const argb,
    VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[4], int width,
    int height, int quality, int low_effort, const CrunchConfig* const config,
-    int* cache_bits, int histogram_bits, size_t init_byte_position,
+    int* cache_bits, int histogram_bits_in, size_t init_byte_position,
    int* const hdr_size, int* const data_size, const WebPPicture* const pic,
    int percent_range, int* const percent) {
  const uint32_t histogram_image_xysize =
-      VP8LSubSampleSize(width, histogram_bits) *
-      VP8LSubSampleSize(height, histogram_bits);
+      VP8LSubSampleSize(width, histogram_bits_in) *
+      VP8LSubSampleSize(height, histogram_bits_in);
  int remaining_percent = percent_range;
  int percent_start = *percent;
  VP8LHistogramSet* histogram_image = NULL;
  VP8LHistogram* tmp_histo = NULL;
-  int histogram_image_size = 0;
+  uint32_t i, histogram_image_size = 0;
  size_t bit_array_size = 0;
  HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
      3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
  HuffmanTreeToken* tokens = NULL;
  HuffmanTreeCode* huffman_codes = NULL;
-  uint16_t* const histogram_symbols = (uint16_t*)WebPSafeMalloc(
-      histogram_image_xysize, sizeof(*histogram_symbols));
+  uint32_t* const histogram_argb = (uint32_t*)WebPSafeMalloc(
+      histogram_image_xysize, sizeof(*histogram_argb));
  int sub_configs_idx;
  int cache_bits_init, write_histogram_image;
  VP8LBitWriter bw_init = *bw, bw_best;
  int hdr_size_tmp;
  VP8LHashChain hash_chain_histogram;  // histogram image hash chain
  size_t bw_size_best = ~(size_t)0;
-  assert(histogram_bits >= MIN_HUFFMAN_BITS);
-  assert(histogram_bits <= MAX_HUFFMAN_BITS);
+  assert(histogram_bits_in >= MIN_HUFFMAN_BITS);
+  assert(histogram_bits_in <= MAX_HUFFMAN_BITS);
  assert(hdr_size != NULL);
  assert(data_size != NULL);

@@ -857,7 +882,7 @@ static int EncodeImageInternal(
  }

  // Make sure we can allocate the different objects.
-  if (huff_tree == NULL || histogram_symbols == NULL ||
+  if (huff_tree == NULL || histogram_argb == NULL ||
      !VP8LHashChainInit(&hash_chain_histogram, histogram_image_xysize)) {
    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
    goto Error;

@@ -899,6 +924,7 @@ static int EncodeImageInternal(
  for (i_cache = 0; i_cache < (sub_config->do_no_cache_ ? 2 : 1); ++i_cache) {
    const int cache_bits_tmp = (i_cache == 0) ? cache_bits_best : 0;
+    int histogram_bits = histogram_bits_in;
    // Speed-up: no need to study the no-cache case if it was already studied
    // in i_cache == 0.
    if (i_cache == 1 && cache_bits_best == 0) break;

@@ -920,7 +946,7 @@ static int EncodeImageInternal(
    if (!VP8LGetHistoImageSymbols(
            width, height, &refs_array[i_cache], quality, low_effort,
            histogram_bits, cache_bits_tmp, histogram_image, tmp_histo,
-            histogram_symbols, pic, i_percent_range, percent)) {
+            histogram_argb, pic, i_percent_range, percent)) {
      goto Error;
    }
    // Create Huffman bit lengths and codes for each histogram image.

@@ -953,26 +979,19 @@ static int EncodeImageInternal(
    }

    // Huffman image + meta huffman.
+    histogram_image_size = 0;
+    for (i = 0; i < histogram_image_xysize; ++i) {
+      if (histogram_argb[i] >= histogram_image_size) {
+        histogram_image_size = histogram_argb[i] + 1;
+      }
+      histogram_argb[i] <<= 8;
+    }
    write_histogram_image = (histogram_image_size > 1);
    VP8LPutBits(bw, write_histogram_image, 1);
    if (write_histogram_image) {
-      uint32_t* const histogram_argb = (uint32_t*)WebPSafeMalloc(
-          histogram_image_xysize, sizeof(*histogram_argb));
-      int max_index = 0;
-      uint32_t i;
-      if (histogram_argb == NULL) {
-        WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
-        goto Error;
-      }
-      for (i = 0; i < histogram_image_xysize; ++i) {
-        const int symbol_index = histogram_symbols[i] & 0xffff;
-        histogram_argb[i] = (symbol_index << 8);
-        if (symbol_index >= max_index) {
-          max_index = symbol_index + 1;
-        }
-      }
-      histogram_image_size = max_index;
+      VP8LOptimizeSampling(histogram_argb, width, height, histogram_bits_in,
+                           MAX_HUFFMAN_BITS, &histogram_bits);
      VP8LPutBits(bw, histogram_bits - 2, 3);
      i_percent_range = i_remaining_percent / 2;
      i_remaining_percent -= i_percent_range;

@@ -981,15 +1000,12 @@ static int EncodeImageInternal(
              VP8LSubSampleSize(width, histogram_bits),
              VP8LSubSampleSize(height, histogram_bits), quality, low_effort,
              pic, i_percent_range, percent)) {
-        WebPSafeFree(histogram_argb);
        goto Error;
      }
-      WebPSafeFree(histogram_argb);
    }

    // Store Huffman codes.
    {
-      int i;
      int max_tokens = 0;
      // Find maximum number of symbols for the huffman tree-set.
      for (i = 0; i < 5 * histogram_image_size; ++i) {

@@ -1012,7 +1028,7 @@ static int EncodeImageInternal(
    // Store actual literals.
    hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
    if (!StoreImageToBitMask(bw, width, histogram_bits, &refs_array[i_cache],
-                             histogram_symbols, huffman_codes, pic)) {
+                             histogram_argb, huffman_codes, pic)) {
      goto Error;
    }
    // Keep track of the smallest image so far.

@@ -1049,7 +1065,7 @@ static int EncodeImageInternal(
    WebPSafeFree(huffman_codes->codes);
    WebPSafeFree(huffman_codes);
  }
-  WebPSafeFree(histogram_symbols);
+  WebPSafeFree(histogram_argb);
  VP8LBitWriterWipeOut(&bw_best);
  return (pic->error_code == VP8_ENC_OK);
}
@@ -1064,54 +1080,60 @@ static void ApplySubtractGreen(VP8LEncoder* const enc, int width, int height,
  VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
}

-static int ApplyPredictFilter(const VP8LEncoder* const enc, int width,
-                              int height, int quality, int low_effort,
+static int ApplyPredictFilter(VP8LEncoder* const enc, int width, int height,
+                              int quality, int low_effort,
                              int used_subtract_green, VP8LBitWriter* const bw,
                              int percent_range, int* const percent) {
-  const int pred_bits = enc->transform_bits_;
-  const int transform_width = VP8LSubSampleSize(width, pred_bits);
-  const int transform_height = VP8LSubSampleSize(height, pred_bits);
+  int best_bits;
  // we disable near-lossless quantization if palette is used.
  const int near_lossless_strength =
      enc->use_palette_ ? 100 : enc->config_->near_lossless;
+  const int max_bits = ClampBits(width, height, enc->predictor_transform_bits_,
+                                 MIN_TRANSFORM_BITS, MAX_TRANSFORM_BITS,
+                                 MAX_PREDICTOR_IMAGE_SIZE);
+  const int min_bits = ClampBits(
+      width, height,
+      max_bits - 2 * (enc->config_->method > 4 ? enc->config_->method - 4 : 0),
+      MIN_TRANSFORM_BITS, MAX_TRANSFORM_BITS, MAX_PREDICTOR_IMAGE_SIZE);

-  if (!VP8LResidualImage(
-          width, height, pred_bits, low_effort, enc->argb_, enc->argb_scratch_,
-          enc->transform_data_, near_lossless_strength, enc->config_->exact,
-          used_subtract_green, enc->pic_, percent_range / 2, percent)) {
+  if (!VP8LResidualImage(width, height, min_bits, max_bits, low_effort,
+                         enc->argb_, enc->argb_scratch_, enc->transform_data_,
+                         near_lossless_strength, enc->config_->exact,
+                         used_subtract_green, enc->pic_, percent_range / 2,
+                         percent, &best_bits)) {
    return 0;
  }
  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
  VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
-  assert(pred_bits >= 2);
-  VP8LPutBits(bw, pred_bits - 2, 3);
+  assert(best_bits >= MIN_TRANSFORM_BITS && best_bits <= MAX_TRANSFORM_BITS);
+  VP8LPutBits(bw, best_bits - MIN_TRANSFORM_BITS, NUM_TRANSFORM_BITS);
+  enc->predictor_transform_bits_ = best_bits;
  return EncodeImageNoHuffman(
-      bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
-      (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
+      bw, enc->transform_data_, &enc->hash_chain_, &enc->refs_[0],
+      VP8LSubSampleSize(width, best_bits), VP8LSubSampleSize(height, best_bits),
      quality, low_effort, enc->pic_, percent_range - percent_range / 2,
      percent);
}
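
The search window widens with encoding effort: min_bits drops two steps below max_bits for each method level past 4. A quick evaluation of that formula, assuming for illustration that max_bits already clamped to 6 (the MinBits name is invented; min_bits' own clamping is omitted for brevity):

#include <stdio.h>

/* Lower end of the predictor-bits search window as a function of method,
 * mirroring the min_bits expression above. */
static int MinBits(int method, int max_bits) {
  return max_bits - 2 * (method > 4 ? method - 4 : 0);
}

int main(void) {
  int method;
  const int max_bits = 6;  /* illustrative value */
  for (method = 3; method <= 6; ++method) {
    printf("method %d: tries bits %d..%d\n", method, MinBits(method, max_bits),
           max_bits);  /* 6..6, 6..6, 4..6, 2..6 */
  }
  return 0;
}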
-static int ApplyCrossColorFilter(const VP8LEncoder* const enc, int width,
-                                 int height, int quality, int low_effort,
+static int ApplyCrossColorFilter(VP8LEncoder* const enc, int width, int height,
+                                 int quality, int low_effort,
                                 VP8LBitWriter* const bw, int percent_range,
                                 int* const percent) {
-  const int ccolor_transform_bits = enc->transform_bits_;
-  const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
-  const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
+  const int min_bits = enc->cross_color_transform_bits_;
+  int best_bits;

-  if (!VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
-                               enc->argb_, enc->transform_data_, enc->pic_,
-                               percent_range / 2, percent)) {
+  if (!VP8LColorSpaceTransform(width, height, min_bits, quality, enc->argb_,
+                               enc->transform_data_, enc->pic_,
+                               percent_range / 2, percent, &best_bits)) {
    return 0;
  }
  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
  VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2);
-  assert(ccolor_transform_bits >= 2);
-  VP8LPutBits(bw, ccolor_transform_bits - 2, 3);
+  assert(best_bits >= MIN_TRANSFORM_BITS && best_bits <= MAX_TRANSFORM_BITS);
+  VP8LPutBits(bw, best_bits - MIN_TRANSFORM_BITS, NUM_TRANSFORM_BITS);
+  enc->cross_color_transform_bits_ = best_bits;
  return EncodeImageNoHuffman(
-      bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
-      (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
+      bw, enc->transform_data_, &enc->hash_chain_, &enc->refs_[0],
+      VP8LSubSampleSize(width, best_bits), VP8LSubSampleSize(height, best_bits),
      quality, low_effort, enc->pic_, percent_range - percent_range / 2,
      percent);
}

@@ -1199,8 +1221,8 @@ static int AllocateTransformBuffer(VP8LEncoder* const enc, int width,
          : 0;
  const uint64_t transform_data_size =
      (enc->use_predict_ || enc->use_cross_color_)
-          ? (uint64_t)VP8LSubSampleSize(width, enc->transform_bits_) *
-                VP8LSubSampleSize(height, enc->transform_bits_)
+          ? (uint64_t)VP8LSubSampleSize(width, MIN_TRANSFORM_BITS) *
+                VP8LSubSampleSize(height, MIN_TRANSFORM_BITS)
          : 0;
  const uint64_t max_alignment_in_words =
      (WEBP_ALIGN_CST + sizeof(uint32_t) - 1) / sizeof(uint32_t);

@@ -1374,13 +1396,11 @@ static int ApplyPalette(const uint32_t* src, uint32_t src_stride, uint32_t* dst,
#undef APPLY_PALETTE_GREEDY_MAX

// Note: Expects "enc->palette_" to be set properly.
-static int MapImageFromPalette(VP8LEncoder* const enc, int in_place) {
+static int MapImageFromPalette(VP8LEncoder* const enc) {
  const WebPPicture* const pic = enc->pic_;
  const int width = pic->width;
  const int height = pic->height;
  const uint32_t* const palette = enc->palette_;
-  const uint32_t* src = in_place ? enc->argb_ : pic->argb;
-  const int src_stride = in_place ? enc->current_width_ : pic->argb_stride;
  const int palette_size = enc->palette_size_;
  int xbits;

@@ -1395,9 +1415,9 @@ static int MapImageFromPalette(VP8LEncoder* const enc, int in_place) {
  if (!AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height)) {
    return 0;
  }
-  if (!ApplyPalette(src, src_stride,
-                    enc->argb_, enc->current_width_,
-                    palette, palette_size, width, height, xbits, pic)) {
+  if (!ApplyPalette(pic->argb, pic->argb_stride, enc->argb_,
+                    enc->current_width_, palette, palette_size, width, height,
+                    xbits, pic)) {
    return 0;
  }
  enc->argb_content_ = kEncoderPalette;

@@ -1405,24 +1425,31 @@ static int MapImageFromPalette(VP8LEncoder* const enc, int in_place) {
}

// Save palette_[] to bitstream.
-static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
-                                       VP8LEncoder* const enc,
-                                       int percent_range, int* const percent) {
+static int EncodePalette(VP8LBitWriter* const bw, int low_effort,
+                         VP8LEncoder* const enc, int percent_range,
+                         int* const percent) {
  int i;
  uint32_t tmp_palette[MAX_PALETTE_SIZE];
  const int palette_size = enc->palette_size_;
  const uint32_t* const palette = enc->palette_;
+  // If the last element is 0, do not store it and count on automatic palette
+  // 0-filling. This can only happen if there is no pixel packing, hence if
+  // there are strictly more than 16 colors (after 0 is removed).
+  const uint32_t encoded_palette_size =
+      (enc->palette_[palette_size - 1] == 0 && palette_size > 17)
+          ? palette_size - 1
+          : palette_size;
  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
  VP8LPutBits(bw, COLOR_INDEXING_TRANSFORM, 2);
  assert(palette_size >= 1 && palette_size <= MAX_PALETTE_SIZE);
-  VP8LPutBits(bw, palette_size - 1, 8);
-  for (i = palette_size - 1; i >= 1; --i) {
+  VP8LPutBits(bw, encoded_palette_size - 1, 8);
+  for (i = encoded_palette_size - 1; i >= 1; --i) {
    tmp_palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
  }
  tmp_palette[0] = palette[0];
-  return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_,
-                              &enc->refs_[0], palette_size, 1, /*quality=*/20,
-                              low_effort, enc->pic_, percent_range, percent);
+  return EncodeImageNoHuffman(
+      bw, tmp_palette, &enc->hash_chain_, &enc->refs_[0], encoded_palette_size,
+      1, /*quality=*/20, low_effort, enc->pic_, percent_range, percent);
}
// ----------------------------------------------------------------------------- // -----------------------------------------------------------------------------
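The trailing-zero trick above saves one palette entry on the wire: the decoder zero-fills missing entries, so a 0x00000000 color parked at the end (only possible with strictly more than 16 remaining colors, i.e. no pixel packing) never needs to be stored. A self-contained sketch of the size computation and of the per-channel modular subtraction used for the delta-coded palette (the helper approximates VP8LSubPixels):

```c
#include <stdint.h>
#include <stdio.h>

// Per-channel (A,R,G,B) subtraction modulo 256 -- the palette is stored as
// deltas between consecutive sorted entries, which compress better.
static uint32_t SubPixels(uint32_t a, uint32_t b) {
  const uint32_t alpha_and_green =
      0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u);
  const uint32_t red_and_blue =
      0xff00ff00u + (a & 0x00ff00ffu) - (b & 0x00ff00ffu);
  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
}

int main(void) {
  // 20 colors with 0x00000000 sorted last: only 19 entries are stored.
  const int palette_size = 20;
  const uint32_t last = 0x00000000u;
  const int encoded_size =
      (last == 0 && palette_size > 17) ? palette_size - 1 : palette_size;
  printf("stored entries: %d\n", encoded_size);                // 19
  printf("delta: %08x\n", SubPixels(0xff102030u, 0xff101010u));  // 00001020
  return 0;
}
```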
@@ -1493,7 +1520,6 @@ static int EncodeStreamHook(void* input, void* data2) {
 #endif
   int hdr_size = 0;
   int data_size = 0;
-  int use_delta_palette = 0;
   int idx;
   size_t best_size = ~(size_t)0;
   VP8LBitWriter bw_init = *bw, bw_best;
@@ -1558,45 +1584,43 @@ static int EncodeStreamHook(void* input, void* data2) {
       goto Error;
     }
     remaining_percent -= percent_range;
 
-    if (!MapImageFromPalette(enc, use_delta_palette)) goto Error;
+    if (!MapImageFromPalette(enc)) goto Error;
     // If using a color cache, do not have it bigger than the number of
     // colors.
     if (enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
       enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
     }
   }
-  if (!use_delta_palette) {
-    // In case image is not packed.
-    if (enc->argb_content_ != kEncoderNearLossless &&
-        enc->argb_content_ != kEncoderPalette) {
-      if (!MakeInputImageCopy(enc)) goto Error;
-    }
+  // In case image is not packed.
+  if (enc->argb_content_ != kEncoderNearLossless &&
+      enc->argb_content_ != kEncoderPalette) {
+    if (!MakeInputImageCopy(enc)) goto Error;
+  }
 
-    // -----------------------------------------------------------------------
-    // Apply transforms and write transform data.
+  // -------------------------------------------------------------------------
+  // Apply transforms and write transform data.
 
-    if (enc->use_subtract_green_) {
-      ApplySubtractGreen(enc, enc->current_width_, height, bw);
-    }
+  if (enc->use_subtract_green_) {
+    ApplySubtractGreen(enc, enc->current_width_, height, bw);
+  }
 
-    if (enc->use_predict_) {
-      percent_range = remaining_percent / 3;
-      if (!ApplyPredictFilter(enc, enc->current_width_, height, quality,
-                              low_effort, enc->use_subtract_green_, bw,
-                              percent_range, &percent)) {
-        goto Error;
-      }
-      remaining_percent -= percent_range;
-    }
+  if (enc->use_predict_) {
+    percent_range = remaining_percent / 3;
+    if (!ApplyPredictFilter(enc, enc->current_width_, height, quality,
+                            low_effort, enc->use_subtract_green_, bw,
+                            percent_range, &percent)) {
+      goto Error;
+    }
+    remaining_percent -= percent_range;
+  }
 
-    if (enc->use_cross_color_) {
-      percent_range = remaining_percent / 2;
-      if (!ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
-                                 low_effort, bw, percent_range, &percent)) {
-        goto Error;
-      }
-      remaining_percent -= percent_range;
-    }
-  }
+  if (enc->use_cross_color_) {
+    percent_range = remaining_percent / 2;
+    if (!ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
+                               low_effort, bw, percent_range, &percent)) {
+      goto Error;
+    }
+    remaining_percent -= percent_range;
+  }
 
   VP8LPutBits(bw, !TRANSFORM_PRESENT, 1);  // No more transforms.
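The progress arithmetic above is budget-based: each stage claims a fixed fraction of whatever percentage is still unclaimed, so removing the delta-palette wrapper does not skew reporting. A worked example of the accounting, assuming 60% of the budget remains when the transforms start:

```c
#include <stdio.h>

int main(void) {
  int remaining = 60;  // budget left after the palette and header stages
  const int predict_range = remaining / 3;      // 20 for prediction
  remaining -= predict_range;                   // 40
  const int cross_color_range = remaining / 2;  // 20 for cross-color
  remaining -= cross_color_range;               // 20 left for entropy coding
  printf("predict=%d cross_color=%d rest=%d\n",
         predict_range, cross_color_range, remaining);
  return 0;
}
```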
@@ -1625,7 +1649,8 @@ static int EncodeStreamHook(void* input, void* data2) {
     if (enc->use_subtract_green_) stats->lossless_features |= 4;
     if (enc->use_palette_) stats->lossless_features |= 8;
     stats->histogram_bits = enc->histo_bits_;
-    stats->transform_bits = enc->transform_bits_;
+    stats->transform_bits = enc->predictor_transform_bits_;
+    stats->cross_color_transform_bits = enc->cross_color_transform_bits_;
     stats->cache_bits = enc->cache_bits_;
     stats->palette_size = enc->palette_size_;
     stats->lossless_size = (int)(best_size - byte_position);
@@ -1735,7 +1760,10 @@ int VP8LEncodeStream(const WebPConfig* const config,
     }
     // Copy the values that were computed for the main encoder.
     enc_side->histo_bits_ = enc_main->histo_bits_;
-    enc_side->transform_bits_ = enc_main->transform_bits_;
+    enc_side->predictor_transform_bits_ =
+        enc_main->predictor_transform_bits_;
+    enc_side->cross_color_transform_bits_ =
+        enc_main->cross_color_transform_bits_;
     enc_side->palette_size_ = enc_main->palette_size_;
     memcpy(enc_side->palette_, enc_main->palette_,
            sizeof(enc_main->palette_));

View File

@@ -34,7 +34,7 @@ extern "C" {
 #endif
 
 // maximum value of transform_bits_ in VP8LEncoder.
-#define MAX_TRANSFORM_BITS 6
+#define MAX_TRANSFORM_BITS (MIN_TRANSFORM_BITS + (1 << NUM_TRANSFORM_BITS) - 1)
 
 typedef enum {
   kEncoderNone = 0,
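With MIN_TRANSFORM_BITS = 2 and NUM_TRANSFORM_BITS = 3 (defined in the format_constants.h hunk below), the macro evaluates to 2 + (1 << 3) - 1 = 9, raising the previous hard-coded ceiling of 6. A compile-time check of the arithmetic, with the constants inlined to stay self-contained:

```c
#include <stdio.h>

#define MIN_TRANSFORM_BITS 2
#define NUM_TRANSFORM_BITS 3
#define MAX_TRANSFORM_BITS (MIN_TRANSFORM_BITS + (1 << NUM_TRANSFORM_BITS) - 1)

// C11 compile-time verification of the new ceiling.
_Static_assert(MAX_TRANSFORM_BITS == 9, "range [2, 9] fits in 3 wire bits");

int main(void) {
  printf("max sampling: %d bits (%d-pixel blocks)\n", MAX_TRANSFORM_BITS,
         1 << MAX_TRANSFORM_BITS);  // 9 bits -> 512-pixel blocks
  return 0;
}
```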
@@ -59,7 +59,8 @@ typedef struct {
   // Encoding parameters derived from quality parameter.
   int histo_bits_;
-  int transform_bits_;    // <= MAX_TRANSFORM_BITS.
+  int predictor_transform_bits_;    // <= MAX_TRANSFORM_BITS
+  int cross_color_transform_bits_;  // <= MAX_TRANSFORM_BITS
   int cache_bits_;        // If equal to 0, don't use color cache.
 
   // Encoding parameters derived from image characteristics.
@@ -104,16 +105,21 @@ int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
 // pic and percent are for progress.
 // Returns false in case of error (stored in pic->error_code).
-int VP8LResidualImage(int width, int height, int bits, int low_effort,
-                      uint32_t* const argb, uint32_t* const argb_scratch,
-                      uint32_t* const image, int near_lossless, int exact,
-                      int used_subtract_green, const WebPPicture* const pic,
-                      int percent_range, int* const percent);
+int VP8LResidualImage(int width, int height, int min_bits, int max_bits,
+                      int low_effort, uint32_t* const argb,
+                      uint32_t* const argb_scratch, uint32_t* const image,
+                      int near_lossless, int exact, int used_subtract_green,
+                      const WebPPicture* const pic, int percent_range,
+                      int* const percent, int* const best_bits);
 
 int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
                             uint32_t* const argb, uint32_t* image,
                             const WebPPicture* const pic, int percent_range,
-                            int* const percent);
+                            int* const percent, int* const best_bits);
+
+void VP8LOptimizeSampling(uint32_t* const image, int full_width,
+                          int full_height, int bits, int max_bits,
+                          int* best_bits_out);
 
 //------------------------------------------------------------------------------
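The new best_bits out-parameters let the transform searches report the sampling they converged on, and VP8LOptimizeSampling checks whether a transform image computed at a fine sampling can be stored at a coarser one. The routine's body is not part of this diff; what follows is only a conceptual sketch of such a coarsening test, not the upstream implementation:

```c
#include <stdint.h>
#include <stdio.h>

// Conceptual only: a per-block mode map sampled every 2^bits pixels can be
// stored one level coarser iff each 2x2 group of fine entries agrees.
static int CanCoarsen(const uint32_t* image, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      if (image[y * w + x] != image[(y & ~1) * w + (x & ~1)]) return 0;
    }
  }
  return 1;
}

int main(void) {
  const uint32_t modes[4] = {7, 7, 7, 7};  // 2x2 map, one mode everywhere
  printf("can coarsen: %s\n", CanCoarsen(modes, 2, 2) ? "yes" : "no");
  return 0;
}
```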

View File

@@ -191,7 +191,8 @@ int WebPAnimEncoderOptionsInitInternal(WebPAnimEncoderOptions* enc_options,
   return 1;
 }
 
-// This starting value is more fit to WebPCleanupTransparentAreaLossless().
+// This value is used to match a later call to WebPReplaceTransparentPixels(),
+// making it a no-op for lossless (see WebPEncode()).
 #define TRANSPARENT_COLOR 0x00000000
 
 static void ClearRectangle(WebPPicture* const picture,

View File

@@ -28,7 +28,7 @@ extern "C" {
 // Defines and constants.
 
 #define MUX_MAJ_VERSION 1
-#define MUX_MIN_VERSION 4
+#define MUX_MIN_VERSION 5
 #define MUX_REV_VERSION 0
 
 // Chunk object.

View File

@@ -223,11 +223,13 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
   // Note this padding is historical and differs from demux.c which does not
   // pad the file size.
   riff_size = SizeWithPadding(riff_size);
-  if (riff_size < CHUNK_HEADER_SIZE) goto Err;
+  // Make sure the whole RIFF header is available.
+  if (riff_size < RIFF_HEADER_SIZE) goto Err;
   if (riff_size > size) goto Err;
-  // There's no point in reading past the end of the RIFF chunk.
-  if (size > riff_size + CHUNK_HEADER_SIZE) {
-    size = riff_size + CHUNK_HEADER_SIZE;
+  // There's no point in reading past the end of the RIFF chunk. Note riff_size
+  // includes CHUNK_HEADER_SIZE after SizeWithPadding().
+  if (size > riff_size) {
+    size = riff_size;
   }
 
   end = data + size;
View File

@@ -124,7 +124,8 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits,
 #if defined(__arm__) || defined(_M_ARM) || WEBP_AARCH64 || \
     defined(__i386__) || defined(_M_IX86) || \
-    defined(__x86_64__) || defined(_M_X64)
+    defined(__x86_64__) || defined(_M_X64) || \
+    defined(__wasm__)
 #define VP8L_USE_FAST_LOAD
 #endif

View File

@@ -69,6 +69,8 @@ extern "C" {
 #define BITS 56
 #elif defined(__mips__)      // MIPS
 #define BITS 24
+#elif defined(__wasm__)      // WASM
+#define BITS 56
 #else                        // reasonable default
 #define BITS 24
 #endif
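Both toggles let WebAssembly behave like a 64-bit native target: the bit reader keeps a 56-bit window and refills it with a single 8-byte load instead of taking the conservative 24-bit path. Unaligned-safe loads are typically done through memcpy, which such targets compile to one instruction; a minimal sketch of the idea (the library's own helper names may differ):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Unaligned-safe 64-bit load: memcpy compiles to a single load on targets
// where misaligned access is cheap (x86, AArch64, wasm).
static uint64_t Load64(const uint8_t* p) {
  uint64_t v;
  memcpy(&v, p, sizeof(v));
  return v;  // note: byte order shown is the host's; kept simple on purpose
}

int main(void) {
  const uint8_t buf[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
  printf("%016llx\n", (unsigned long long)Load64(buf + 1));  // odd offset: OK
  return 0;
}
```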

View File

@@ -191,6 +191,12 @@ static void PaletteSortMinimizeDeltas(const uint32_t* const palette_sorted,
   // Find greedily always the closest color of the predicted color to minimize
   // deltas in the palette. This reduces storage needs since the
   // palette is stored with delta encoding.
+  if (num_colors > 17) {
+    if (palette[0] == 0) {
+      --num_colors;
+      SwapColor(&palette[num_colors], &palette[0]);
+    }
+  }
   for (i = 0; i < num_colors; ++i) {
     int best_ix = i;
     uint32_t best_score = ~0U;
@@ -384,8 +390,13 @@ int PaletteSort(PaletteSorting method, const struct WebPPicture* const pic,
                 uint32_t* const palette) {
   switch (method) {
     case kSortedDefault:
-      // Nothing to do, we have already sorted the palette.
-      memcpy(palette, palette_sorted, num_colors * sizeof(*palette));
+      if (palette_sorted[0] == 0 && num_colors > 17) {
+        memcpy(palette, palette_sorted + 1,
+               (num_colors - 1) * sizeof(*palette_sorted));
+        palette[num_colors - 1] = 0;
+      } else {
+        memcpy(palette, palette_sorted, num_colors * sizeof(*palette));
+      }
       return 1;
     case kMinimizeDelta:
       PaletteSortMinimizeDeltas(palette_sorted, num_colors, palette);
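Both sorting paths now establish the invariant EncodePalette() exploits: with more than 17 colors, a 0x00000000 entry ends up last. The lexicographic pre-sort places 0 first, so kSortedDefault simply rotates it out; a toy illustration of that rotation:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  // 18 lexicographically sorted colors, transparent black first.
  uint32_t sorted[18] = {0x00000000u};
  uint32_t palette[18];
  int i;
  for (i = 1; i < 18; ++i) sorted[i] = 0xff000000u + (uint32_t)i;

  // kSortedDefault path for >17 colors: shift everything down one slot
  // and park the zero entry at the end, where it need not be stored.
  memcpy(palette, sorted + 1, (18 - 1) * sizeof(*sorted));
  palette[18 - 1] = 0;

  printf("first=%08x last=%08x\n", palette[0], palette[17]);
  return 0;
}
```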

View File

@@ -53,6 +53,8 @@ int GetColorPalette(const struct WebPPicture* const pic,
 // Sorts the palette according to the criterion defined by 'method'.
 // 'palette_sorted' is the input palette sorted lexicographically, as done in
 // PrepareMapToPalette. Returns 0 on memory allocation error.
+// For kSortedDefault and kMinimizeDelta methods, 0 (if present) is set as the
+// last element to optimize later storage.
 int PaletteSort(PaletteSorting method, const struct WebPPicture* const pic,
                 const uint32_t* const palette_sorted, uint32_t num_colors,
                 uint32_t* const palette);

View File

@@ -20,7 +20,7 @@
 extern "C" {
 #endif
 
-#define WEBP_ENCODER_ABI_VERSION 0x020f    // MAJOR(8b) + MINOR(8b)
+#define WEBP_ENCODER_ABI_VERSION 0x0210    // MAJOR(8b) + MINOR(8b)
 
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // the types are left here for reference.
@@ -145,7 +145,7 @@ struct WebPConfig {
                           // RGB information for better compression. The default
                           // value is 0.
 
-  int use_delta_palette;  // reserved for future lossless feature
+  int use_delta_palette;  // reserved
   int use_sharp_yuv;      // if needed, use sharp (and slow) RGB->YUV conversion
 
   int qmin;               // minimum permissible quality factor
@@ -224,14 +224,15 @@ struct WebPAuxStats {
   uint32_t lossless_features;  // bit0:predictor bit1:cross-color transform
                                // bit2:subtract-green bit3:color indexing
   int histogram_bits;          // number of precision bits of histogram
-  int transform_bits;          // precision bits for transform
+  int transform_bits;          // precision bits for predictor transform
   int cache_bits;              // number of bits for color cache lookup
   int palette_size;            // number of color in palette, if used
   int lossless_size;           // final lossless size
   int lossless_hdr_size;       // lossless header (transform, huffman etc) size
   int lossless_data_size;      // lossless image data size
+  int cross_color_transform_bits;  // precision bits for cross-color transform
 
-  uint32_t pad[2];        // padding for later use
+  uint32_t pad[1];        // padding for later use
 };
 
 // Signature for output function. Should return true if writing was successful.
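Growing the struct by one int while shrinking pad[] from two entries to one keeps sizeof(WebPAuxStats) unchanged, which is why only the ABI minor version is bumped above. A sketch of the invariant on simplified struct tails (32-bit field widths assumed):

```c
#include <assert.h>
#include <stdint.h>

// Simplified tails of the 1.4.0 and 1.5.0 stats structs: the new int field
// grows out of the padding, so the overall size is unchanged.
struct OldTail { int lossless_data_size; uint32_t pad[2]; };
struct NewTail { int lossless_data_size; int cross_color_transform_bits;
                 uint32_t pad[1]; };

int main(void) {
  assert(sizeof(struct OldTail) == sizeof(struct NewTail));  // both 12 bytes
  return 0;
}
```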

View File

@@ -46,7 +46,12 @@
 #define CODE_LENGTH_CODES 19
 
 #define MIN_HUFFMAN_BITS 2  // min number of Huffman bits
-#define MAX_HUFFMAN_BITS 9  // max number of Huffman bits
+#define NUM_HUFFMAN_BITS 3
+
+// the maximum number of bits defining a transform is
+// MIN_TRANSFORM_BITS + (1 << NUM_TRANSFORM_BITS) - 1
+#define MIN_TRANSFORM_BITS 2
+#define NUM_TRANSFORM_BITS 3
 
 #define TRANSFORM_PRESENT 1  // The bit to be written when next data
                              // to be read is a transform.
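The MIN/NUM split mirrors how the values travel in the bitstream: what is written is the offset from the minimum, in NUM_*_BITS bits, so both Huffman and transform sampling cover [2, 9]. A sketch of the round trip, with the actual bit-writer and bit-reader calls elided:

```c
#include <stdio.h>

#define MIN_TRANSFORM_BITS 2
#define NUM_TRANSFORM_BITS 3

int main(void) {
  // Encoder side: a sampling of 5 bits is stored as 5 - 2 = 3, in 3 bits.
  const int bits = 5;
  const int wire = bits - MIN_TRANSFORM_BITS;

  // Decoder side: read NUM_TRANSFORM_BITS bits and add the minimum back.
  const int decoded = wire + MIN_TRANSFORM_BITS;
  printf("wire=%d decoded=%d (range [%d, %d])\n", wire, decoded,
         MIN_TRANSFORM_BITS,
         MIN_TRANSFORM_BITS + (1 << NUM_TRANSFORM_BITS) - 1);
  return 0;
}
```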

View File

@@ -38,11 +38,11 @@ typedef long long int int64_t;
 #ifndef WEBP_NODISCARD
 #if defined(WEBP_ENABLE_NODISCARD) && WEBP_ENABLE_NODISCARD
-#if (defined(__cplusplus) && __cplusplus >= 201700L) || \
+#if (defined(__cplusplus) && __cplusplus >= 201703L) || \
     (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L)
 #define WEBP_NODISCARD [[nodiscard]]
 #else
-// gcc's __has_attribute does not work for enums.
+// gcc's __attribute__((warn_unused_result)) does not work for enums.
 #if defined(__clang__) && defined(__has_attribute)
 #if __has_attribute(warn_unused_result)
 #define WEBP_NODISCARD __attribute__((warn_unused_result))
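The guard fix matters because [[nodiscard]] is standard only from C++17 (whose __cplusplus value is 201703L) and C23 onward; the old 201700L threshold could admit pre-standard compilers. A small consumption sketch of the macro (hypothetical function, not a libwebp API; compile as C23 or C++17 for the attribute to take effect):

```c
#if (defined(__cplusplus) && __cplusplus >= 201703L) || \
    (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L)
#define WEBP_NODISCARD [[nodiscard]]
#else
#define WEBP_NODISCARD
#endif

// Hypothetical status-returning call (not a libwebp API).
WEBP_NODISCARD static int ParseHeader(int size) { return size > 0; }

int main(void) {
  // ParseHeader(0);               // would warn: [[nodiscard]] value dropped
  return ParseHeader(12) ? 0 : 1;  // OK: the result is consumed
}
```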