Introduce BOP_CMP for optimized comparison

Prior to this commit the `OPTIMIZED_CMP` macro relied on a method lookup
to determine whether `<=>` was overridden. The result of the lookup was
cached, but only for the duration of the specific method that
initialized the cmp_opt_data cache structure.

With this method lookup, `[x,y].max` is slower than doing `x > y ?
x : y` even though there's an optimized instruction for "new array max".
(John noticed that somebody had proposed a micro-optimization based on this
fact in https://github.com/mastodon/mastodon/pull/19903.)

```rb
a, b = 1, 2
Benchmark.ips do |bm|
  bm.report('conditional') { a > b ? a : b }
  bm.report('method') { [a, b].max }
  bm.compare!
end
```

Before:

```
Comparison:
         conditional: 22603733.2 i/s
              method: 19820412.7 i/s - 1.14x  (± 0.00) slower
```

This commit replaces the method lookup with a new CMP basic op, which
gives the examples above equivalent performance.

After:

```
Comparison:
              method: 24022466.5 i/s
         conditional: 23851094.2 i/s - same-ish: difference falls within
error
```

Relevant benchmarks show an improvement to Array#max and Array#min when
not using the optimized newarray_max instruction as well. They are
noticeably faster for small arrays with the relevant types, and the same
or maybe a touch faster on larger arrays.

```
$ make benchmark COMPARE_RUBY=<master@5958c305> ITEM=array_min
$ make benchmark COMPARE_RUBY=<master@5958c305> ITEM=array_max
```

The benchmarks added in this commit also look generally improved.

Co-authored-by: John Hawthorn <jhawthorn@github.com>
This commit is contained in:
Daniel Colson 2022-11-22 21:16:11 -05:00 committed by John Hawthorn
parent c43951e60e
commit e69b91fae4
12 changed files with 96 additions and 82 deletions

23
array.c
View File

@ -3456,7 +3456,6 @@ rb_ary_rotate_m(int argc, VALUE *argv, VALUE ary)
struct ary_sort_data {
VALUE ary;
VALUE receiver;
struct cmp_opt_data cmp_opt;
};
static VALUE
@ -3502,15 +3501,15 @@ sort_2(const void *ap, const void *bp, void *dummy)
VALUE a = *(const VALUE *)ap, b = *(const VALUE *)bp;
int n;
if (FIXNUM_P(a) && FIXNUM_P(b) && CMP_OPTIMIZABLE(data->cmp_opt, Integer)) {
if (FIXNUM_P(a) && FIXNUM_P(b) && CMP_OPTIMIZABLE(INTEGER)) {
if ((long)a > (long)b) return 1;
if ((long)a < (long)b) return -1;
return 0;
}
if (STRING_P(a) && STRING_P(b) && CMP_OPTIMIZABLE(data->cmp_opt, String)) {
if (STRING_P(a) && STRING_P(b) && CMP_OPTIMIZABLE(STRING)) {
return rb_str_cmp(a, b);
}
if (RB_FLOAT_TYPE_P(a) && CMP_OPTIMIZABLE(data->cmp_opt, Float)) {
if (RB_FLOAT_TYPE_P(a) && CMP_OPTIMIZABLE(FLOAT)) {
return rb_float_cmp(a, b);
}
@ -3574,8 +3573,6 @@ rb_ary_sort_bang(VALUE ary)
RBASIC_CLEAR_CLASS(tmp);
data.ary = tmp;
data.receiver = ary;
data.cmp_opt.opt_methods = 0;
data.cmp_opt.opt_inited = 0;
RARRAY_PTR_USE(tmp, ptr, {
ruby_qsort(ptr, len, sizeof(VALUE),
rb_block_given_p()?sort_1:sort_2, &data);
@ -6056,7 +6053,6 @@ ary_max_opt_string(VALUE ary, long i, VALUE vmax)
static VALUE
rb_ary_max(int argc, VALUE *argv, VALUE ary)
{
struct cmp_opt_data cmp_opt = { 0, 0 };
VALUE result = Qundef, v;
VALUE num;
long i;
@ -6076,13 +6072,13 @@ rb_ary_max(int argc, VALUE *argv, VALUE ary)
else if (n > 0) {
result = RARRAY_AREF(ary, 0);
if (n > 1) {
if (FIXNUM_P(result) && CMP_OPTIMIZABLE(cmp_opt, Integer)) {
if (FIXNUM_P(result) && CMP_OPTIMIZABLE(INTEGER)) {
return ary_max_opt_fixnum(ary, 1, result);
}
else if (STRING_P(result) && CMP_OPTIMIZABLE(cmp_opt, String)) {
else if (STRING_P(result) && CMP_OPTIMIZABLE(STRING)) {
return ary_max_opt_string(ary, 1, result);
}
else if (RB_FLOAT_TYPE_P(result) && CMP_OPTIMIZABLE(cmp_opt, Float)) {
else if (RB_FLOAT_TYPE_P(result) && CMP_OPTIMIZABLE(FLOAT)) {
return ary_max_opt_float(ary, 1, result);
}
else {
@ -6225,7 +6221,6 @@ ary_min_opt_string(VALUE ary, long i, VALUE vmin)
static VALUE
rb_ary_min(int argc, VALUE *argv, VALUE ary)
{
struct cmp_opt_data cmp_opt = { 0, 0 };
VALUE result = Qundef, v;
VALUE num;
long i;
@ -6245,13 +6240,13 @@ rb_ary_min(int argc, VALUE *argv, VALUE ary)
else if (n > 0) {
result = RARRAY_AREF(ary, 0);
if (n > 1) {
if (FIXNUM_P(result) && CMP_OPTIMIZABLE(cmp_opt, Integer)) {
if (FIXNUM_P(result) && CMP_OPTIMIZABLE(INTEGER)) {
return ary_min_opt_fixnum(ary, 1, result);
}
else if (STRING_P(result) && CMP_OPTIMIZABLE(cmp_opt, String)) {
else if (STRING_P(result) && CMP_OPTIMIZABLE(STRING)) {
return ary_min_opt_string(ary, 1, result);
}
else if (RB_FLOAT_TYPE_P(result) && CMP_OPTIMIZABLE(cmp_opt, Float)) {
else if (RB_FLOAT_TYPE_P(result) && CMP_OPTIMIZABLE(FLOAT)) {
return ary_min_opt_float(ary, 1, result);
}
else {

View File

@ -0,0 +1,15 @@
prelude: |
ary2 = 2.times.to_a.shuffle
ary10 = 10.times.to_a.shuffle
ary100 = 100.times.to_a.shuffle
ary1000 = 1000.times.to_a.shuffle
ary10000 = 10000.times.to_a.shuffle
benchmark:
ary2.sort: ary2.sort
ary10.sort: ary10.sort
ary100.sort: ary100.sort
ary1000.sort: ary1000.sort
ary10000.sort: ary10000.sort
loop_count: 10000

25
benchmark/enum_minmax.yml Normal file
View File

@ -0,0 +1,25 @@
prelude: |
set2 = 2.times.to_a.shuffle.to_set
set10 = 10.times.to_a.shuffle.to_set
set100 = 100.times.to_a.shuffle.to_set
set1000 = 1000.times.to_a.shuffle.to_set
set10000 = 10000.times.to_a.shuffle.to_set
benchmark:
set2.min: set2.min
set10.min: set10.min
set100.min: set100.min
set1000.min: set1000.min
set10000.min: set10000.min
set2.max: set2.max
set10.max: set10.max
set100.max: set100.max
set1000.max: set1000.max
set10000.max: set10000.max
set2.minmax: set2.minmax
set10.minmax: set10.minmax
set100.minmax: set100.minmax
set1000.minmax: set1000.minmax
set10000.minmax: set10000.minmax
loop_count: 10000

15
benchmark/enum_sort.yml Normal file
View File

@ -0,0 +1,15 @@
prelude: |
set2 = 2.times.to_a.shuffle.to_set
set10 = 10.times.to_a.shuffle.to_set
set100 = 100.times.to_a.shuffle.to_set
set1000 = 1000.times.to_a.shuffle.to_set
set10000 = 10000.times.to_a.shuffle.to_set
benchmark:
set2.sort_by: set2.sort_by { 0 }
set10.sort_by: set10.sort_by { 0 }
set100.sort_by: set100.sort_by { 0 }
set1000.sort_by: set1000.sort_by { 0 }
set10000.sort_by: set10000.sort_by { 0 }
loop_count: 10000

2
benchmark/range_min.yml Normal file
View File

@ -0,0 +1,2 @@
benchmark:
- (1..10).min

49
enum.c
View File

@ -1373,7 +1373,6 @@ sort_by_i(RB_BLOCK_CALL_FUNC_ARGLIST(i, _data))
static int
sort_by_cmp(const void *ap, const void *bp, void *data)
{
struct cmp_opt_data cmp_opt = { 0, 0 };
VALUE a;
VALUE b;
VALUE ary = (VALUE)data;
@ -1385,7 +1384,7 @@ sort_by_cmp(const void *ap, const void *bp, void *data)
a = *(VALUE *)ap;
b = *(VALUE *)bp;
return OPTIMIZED_CMP(a, b, cmp_opt);
return OPTIMIZED_CMP(a, b);
}
/*
@ -1713,11 +1712,10 @@ cmpint_reenter_check(struct nmin_data *data, VALUE val)
static int
nmin_cmp(const void *ap, const void *bp, void *_data)
{
struct cmp_opt_data cmp_opt = { 0, 0 };
struct nmin_data *data = (struct nmin_data *)_data;
VALUE a = *(const VALUE *)ap, b = *(const VALUE *)bp;
#define rb_cmpint(cmp, a, b) rb_cmpint(cmpint_reenter_check(data, (cmp)), a, b)
return OPTIMIZED_CMP(a, b, cmp_opt);
return OPTIMIZED_CMP(a, b);
#undef rb_cmpint
}
@ -2027,7 +2025,6 @@ enum_none(int argc, VALUE *argv, VALUE obj)
struct min_t {
VALUE min;
struct cmp_opt_data cmp_opt;
};
static VALUE
@ -2041,7 +2038,7 @@ min_i(RB_BLOCK_CALL_FUNC_ARGLIST(i, args))
memo->min = i;
}
else {
if (OPTIMIZED_CMP(i, memo->min, memo->cmp_opt) < 0) {
if (OPTIMIZED_CMP(i, memo->min) < 0) {
memo->min = i;
}
}
@ -2130,7 +2127,7 @@ static VALUE
enum_min(int argc, VALUE *argv, VALUE obj)
{
VALUE memo;
struct min_t *m = NEW_CMP_OPT_MEMO(struct min_t, memo);
struct min_t *m = NEW_MEMO_FOR(struct min_t, memo);
VALUE result;
VALUE num;
@ -2138,8 +2135,6 @@ enum_min(int argc, VALUE *argv, VALUE obj)
return rb_nmin_run(obj, num, 0, 0, 0);
m->min = Qundef;
m->cmp_opt.opt_methods = 0;
m->cmp_opt.opt_inited = 0;
if (rb_block_given_p()) {
rb_block_call(obj, id_each, 0, 0, min_ii, memo);
}
@ -2153,7 +2148,6 @@ enum_min(int argc, VALUE *argv, VALUE obj)
struct max_t {
VALUE max;
struct cmp_opt_data cmp_opt;
};
static VALUE
@ -2167,7 +2161,7 @@ max_i(RB_BLOCK_CALL_FUNC_ARGLIST(i, args))
memo->max = i;
}
else {
if (OPTIMIZED_CMP(i, memo->max, memo->cmp_opt) > 0) {
if (OPTIMIZED_CMP(i, memo->max) > 0) {
memo->max = i;
}
}
@ -2255,7 +2249,7 @@ static VALUE
enum_max(int argc, VALUE *argv, VALUE obj)
{
VALUE memo;
struct max_t *m = NEW_CMP_OPT_MEMO(struct max_t, memo);
struct max_t *m = NEW_MEMO_FOR(struct max_t, memo);
VALUE result;
VALUE num;
@ -2263,8 +2257,6 @@ enum_max(int argc, VALUE *argv, VALUE obj)
return rb_nmin_run(obj, num, 0, 1, 0);
m->max = Qundef;
m->cmp_opt.opt_methods = 0;
m->cmp_opt.opt_inited = 0;
if (rb_block_given_p()) {
rb_block_call(obj, id_each, 0, 0, max_ii, (VALUE)memo);
}
@ -2280,7 +2272,6 @@ struct minmax_t {
VALUE min;
VALUE max;
VALUE last;
struct cmp_opt_data cmp_opt;
};
static void
@ -2293,11 +2284,11 @@ minmax_i_update(VALUE i, VALUE j, struct minmax_t *memo)
memo->max = j;
}
else {
n = OPTIMIZED_CMP(i, memo->min, memo->cmp_opt);
n = OPTIMIZED_CMP(i, memo->min);
if (n < 0) {
memo->min = i;
}
n = OPTIMIZED_CMP(j, memo->max, memo->cmp_opt);
n = OPTIMIZED_CMP(j, memo->max);
if (n > 0) {
memo->max = j;
}
@ -2320,7 +2311,7 @@ minmax_i(RB_BLOCK_CALL_FUNC_ARGLIST(i, _memo))
j = memo->last;
memo->last = Qundef;
n = OPTIMIZED_CMP(j, i, memo->cmp_opt);
n = OPTIMIZED_CMP(j, i);
if (n == 0)
i = j;
else if (n < 0) {
@ -2422,12 +2413,10 @@ static VALUE
enum_minmax(VALUE obj)
{
VALUE memo;
struct minmax_t *m = NEW_CMP_OPT_MEMO(struct minmax_t, memo);
struct minmax_t *m = NEW_MEMO_FOR(struct minmax_t, memo);
m->min = Qundef;
m->last = Qundef;
m->cmp_opt.opt_methods = 0;
m->cmp_opt.opt_inited = 0;
if (rb_block_given_p()) {
rb_block_call(obj, id_each, 0, 0, minmax_ii, memo);
if (!UNDEF_P(m->last))
@ -2447,7 +2436,6 @@ enum_minmax(VALUE obj)
static VALUE
min_by_i(RB_BLOCK_CALL_FUNC_ARGLIST(i, args))
{
struct cmp_opt_data cmp_opt = { 0, 0 };
struct MEMO *memo = MEMO_CAST(args);
VALUE v;
@ -2458,7 +2446,7 @@ min_by_i(RB_BLOCK_CALL_FUNC_ARGLIST(i, args))
MEMO_V1_SET(memo, v);
MEMO_V2_SET(memo, i);
}
else if (OPTIMIZED_CMP(v, memo->v1, cmp_opt) < 0) {
else if (OPTIMIZED_CMP(v, memo->v1) < 0) {
MEMO_V1_SET(memo, v);
MEMO_V2_SET(memo, i);
}
@ -2522,7 +2510,6 @@ enum_min_by(int argc, VALUE *argv, VALUE obj)
static VALUE
max_by_i(RB_BLOCK_CALL_FUNC_ARGLIST(i, args))
{
struct cmp_opt_data cmp_opt = { 0, 0 };
struct MEMO *memo = MEMO_CAST(args);
VALUE v;
@ -2533,7 +2520,7 @@ max_by_i(RB_BLOCK_CALL_FUNC_ARGLIST(i, args))
MEMO_V1_SET(memo, v);
MEMO_V2_SET(memo, i);
}
else if (OPTIMIZED_CMP(v, memo->v1, cmp_opt) > 0) {
else if (OPTIMIZED_CMP(v, memo->v1) > 0) {
MEMO_V1_SET(memo, v);
MEMO_V2_SET(memo, i);
}
@ -2606,8 +2593,6 @@ struct minmax_by_t {
static void
minmax_by_i_update(VALUE v1, VALUE v2, VALUE i1, VALUE i2, struct minmax_by_t *memo)
{
struct cmp_opt_data cmp_opt = { 0, 0 };
if (UNDEF_P(memo->min_bv)) {
memo->min_bv = v1;
memo->max_bv = v2;
@ -2615,11 +2600,11 @@ minmax_by_i_update(VALUE v1, VALUE v2, VALUE i1, VALUE i2, struct minmax_by_t *m
memo->max = i2;
}
else {
if (OPTIMIZED_CMP(v1, memo->min_bv, cmp_opt) < 0) {
if (OPTIMIZED_CMP(v1, memo->min_bv) < 0) {
memo->min_bv = v1;
memo->min = i1;
}
if (OPTIMIZED_CMP(v2, memo->max_bv, cmp_opt) > 0) {
if (OPTIMIZED_CMP(v2, memo->max_bv) > 0) {
memo->max_bv = v2;
memo->max = i2;
}
@ -2629,7 +2614,6 @@ minmax_by_i_update(VALUE v1, VALUE v2, VALUE i1, VALUE i2, struct minmax_by_t *m
static VALUE
minmax_by_i(RB_BLOCK_CALL_FUNC_ARGLIST(i, _memo))
{
struct cmp_opt_data cmp_opt = { 0, 0 };
struct minmax_by_t *memo = MEMO_FOR(struct minmax_by_t, _memo);
VALUE vi, vj, j;
int n;
@ -2647,7 +2631,7 @@ minmax_by_i(RB_BLOCK_CALL_FUNC_ARGLIST(i, _memo))
j = memo->last;
memo->last_bv = Qundef;
n = OPTIMIZED_CMP(vj, vi, cmp_opt);
n = OPTIMIZED_CMP(vj, vi);
if (n == 0) {
i = j;
vi = vj;
@ -3033,7 +3017,6 @@ each_cons_i(RB_BLOCK_CALL_FUNC_ARGLIST(i, args))
static VALUE
enum_each_cons_size(VALUE obj, VALUE args, VALUE eobj)
{
struct cmp_opt_data cmp_opt = { 0, 0 };
const VALUE zero = LONG2FIX(0);
VALUE n, size;
long cons_size = NUM2LONG(RARRAY_AREF(args, 0));
@ -3043,7 +3026,7 @@ enum_each_cons_size(VALUE obj, VALUE args, VALUE eobj)
if (NIL_P(size)) return Qnil;
n = add_int(size, 1 - cons_size);
return (OPTIMIZED_CMP(n, zero, cmp_opt) == -1) ? zero : n;
return (OPTIMIZED_CMP(n, zero) == -1) ? zero : n;
}
/*

View File

@ -34,6 +34,7 @@ enum ruby_basic_operators {
BOP_CALL,
BOP_AND,
BOP_OR,
BOP_CMP,
BOP_LAST_
};

View File

@ -8,38 +8,18 @@
* file COPYING are met. Consult the file for details.
* @brief Internal header for Comparable.
*/
#include "internal/vm.h" /* for rb_method_basic_definition_p */
#include "internal/basic_operators.h"
#define STRING_P(s) (RB_TYPE_P((s), T_STRING) && CLASS_OF(s) == rb_cString)
enum {
cmp_opt_Integer,
cmp_opt_String,
cmp_opt_Float,
cmp_optimizable_count
};
#define CMP_OPTIMIZABLE(type) BASIC_OP_UNREDEFINED_P(BOP_CMP, type##_REDEFINED_OP_FLAG)
struct cmp_opt_data {
unsigned int opt_methods;
unsigned int opt_inited;
};
#define NEW_CMP_OPT_MEMO(type, value) \
NEW_PARTIAL_MEMO_FOR(type, value, cmp_opt)
#define CMP_OPTIMIZABLE_BIT(type) (1U << TOKEN_PASTE(cmp_opt_,type))
#define CMP_OPTIMIZABLE(data, type) \
(((data).opt_inited & CMP_OPTIMIZABLE_BIT(type)) ? \
((data).opt_methods & CMP_OPTIMIZABLE_BIT(type)) : \
(((data).opt_inited |= CMP_OPTIMIZABLE_BIT(type)), \
rb_method_basic_definition_p(TOKEN_PASTE(rb_c,type), id_cmp) && \
((data).opt_methods |= CMP_OPTIMIZABLE_BIT(type))))
#define OPTIMIZED_CMP(a, b, data) \
((FIXNUM_P(a) && FIXNUM_P(b) && CMP_OPTIMIZABLE(data, Integer)) ? \
#define OPTIMIZED_CMP(a, b) \
((FIXNUM_P(a) && FIXNUM_P(b) && CMP_OPTIMIZABLE(INTEGER)) ? \
(((long)a > (long)b) ? 1 : ((long)a < (long)b) ? -1 : 0) : \
(STRING_P(a) && STRING_P(b) && CMP_OPTIMIZABLE(data, String)) ? \
(STRING_P(a) && STRING_P(b) && CMP_OPTIMIZABLE(STRING)) ? \
rb_str_cmp(a, b) : \
(RB_FLOAT_TYPE_P(a) && RB_FLOAT_TYPE_P(b) && CMP_OPTIMIZABLE(data, Float)) ? \
(RB_FLOAT_TYPE_P(a) && RB_FLOAT_TYPE_P(b) && CMP_OPTIMIZABLE(FLOAT)) ? \
rb_float_cmp(a, b) : \
rb_cmpint(rb_funcallv(a, id_cmp, 1, &b), a, b))

View File

@ -1297,10 +1297,9 @@ range_min(int argc, VALUE *argv, VALUE range)
return range_first(argc, argv, range);
}
else {
struct cmp_opt_data cmp_opt = { 0, 0 };
VALUE b = RANGE_BEG(range);
VALUE e = RANGE_END(range);
int c = NIL_P(e) ? -1 : OPTIMIZED_CMP(b, e, cmp_opt);
int c = NIL_P(e) ? -1 : OPTIMIZED_CMP(b, e);
if (c > 0 || (c == 0 && EXCL(range)))
return Qnil;
@ -1408,8 +1407,7 @@ range_max(int argc, VALUE *argv, VALUE range)
return rb_call_super(argc, argv);
}
else {
struct cmp_opt_data cmp_opt = { 0, 0 };
int c = NIL_P(b) ? -1 : OPTIMIZED_CMP(b, e, cmp_opt);
int c = NIL_P(b) ? -1 : OPTIMIZED_CMP(b, e);
if (c > 0)
return Qnil;

1
vm.c
View File

@ -2035,6 +2035,7 @@ vm_init_redefined_flag(void)
OP(And, AND), (C(Integer));
OP(Or, OR), (C(Integer));
OP(NilP, NIL_P), (C(NilClass));
OP(Cmp, CMP), (C(Integer), C(Float), C(String));
#undef C
#undef OP
}

View File

@ -5176,12 +5176,11 @@ vm_opt_newarray_max(rb_execution_context_t *ec, rb_num_t num, const VALUE *ptr)
return Qnil;
}
else {
struct cmp_opt_data cmp_opt = { 0, 0 };
VALUE result = *ptr;
rb_snum_t i = num - 1;
while (i-- > 0) {
const VALUE v = *++ptr;
if (OPTIMIZED_CMP(v, result, cmp_opt) > 0) {
if (OPTIMIZED_CMP(v, result) > 0) {
result = v;
}
}
@ -5201,12 +5200,11 @@ vm_opt_newarray_min(rb_execution_context_t *ec, rb_num_t num, const VALUE *ptr)
return Qnil;
}
else {
struct cmp_opt_data cmp_opt = { 0, 0 };
VALUE result = *ptr;
rb_snum_t i = num - 1;
while (i-- > 0) {
const VALUE v = *++ptr;
if (OPTIMIZED_CMP(v, result, cmp_opt) < 0) {
if (OPTIMIZED_CMP(v, result) < 0) {
result = v;
}
}

View File

@ -720,7 +720,8 @@ pub const BOP_MIN: ruby_basic_operators = 25;
pub const BOP_CALL: ruby_basic_operators = 26;
pub const BOP_AND: ruby_basic_operators = 27;
pub const BOP_OR: ruby_basic_operators = 28;
pub const BOP_LAST_: ruby_basic_operators = 29;
pub const BOP_CMP: ruby_basic_operators = 29;
pub const BOP_LAST_: ruby_basic_operators = 30;
pub type ruby_basic_operators = u32;
pub type rb_serial_t = ::std::os::raw::c_ulonglong;
extern "C" {