ruby/tool/rjit/bindgen.rb

#!/usr/bin/env ruby
# frozen_string_literal: true

ENV['GEM_HOME'] = File.expand_path('./.bundle', __dir__)
require 'rubygems/source'
require 'bundler/inline'
gemfile(true) do
  source 'https://rubygems.org'
  gem 'ffi-clang', '0.7.0', require: false
end

# Help ffi-clang find libclang
# Hint: apt install libclang1
ENV['LIBCLANG'] ||= Dir.glob("/usr/lib/llvm-*/lib/libclang.so.1").grep_v(/-cpp/).sort.last
require 'ffi/clang'

require 'etc'
require 'fiddle/import'
require 'set'

unless build_dir = ARGV.first
  abort "Usage: #{$0} BUILD_DIR"
end

class Node < Struct.new(
  :kind,
  :spelling,
  :type,
  :typedef_type,
  :bitwidth,
  :sizeof_type,
  :offsetof,
  :enum_value,
  :children,
  keyword_init: true,
)
end

# Parse a C header with ffi-clang and return Node objects.
# To ease the maintenance, ffi-clang should be used only inside this class.
class HeaderParser
  def initialize(header, cflags:)
    @translation_unit = FFI::Clang::Index.new.parse_translation_unit(header, cflags, [], {})
  end

  def parse
    parse_children(@translation_unit.cursor)
  end

  private

  def parse_children(cursor)
    children = []
    cursor.visit_children do |cursor, _parent|
      children << parse_cursor(cursor)
      next :continue
    end
    children
  end

  def parse_cursor(cursor)
    unless cursor.kind.start_with?('cursor_')
      raise "unexpected cursor kind: #{cursor.kind}"
    end
    kind = cursor.kind.to_s.delete_prefix('cursor_').to_sym
    children = parse_children(cursor)

    offsetof = {}
    if kind == :struct
      children.select { |c| c.kind == :field_decl }.each do |child|
        offsetof[child.spelling] = cursor.type.offsetof(child.spelling)
      end
    end

    sizeof_type = nil
    if %i[struct union].include?(kind)
      sizeof_type = cursor.type.sizeof
    end

    enum_value = nil
    if kind == :enum_constant_decl
      enum_value = cursor.enum_value
    end

    Node.new(
      kind: kind,
      spelling: cursor.spelling,
      type: cursor.type.spelling,
      typedef_type: cursor.typedef_type.spelling,
      bitwidth: cursor.bitwidth,
      sizeof_type: sizeof_type,
      offsetof: offsetof,
      enum_value: enum_value,
      children: children,
    )
  end
end

# Convert Node objects to a Ruby binding source.
class BindingGenerator
  BINDGEN_BEG = '### RJIT bindgen begin ###'
  BINDGEN_END = '### RJIT bindgen end ###'
  DEFAULTS = { '_Bool' => 'CType::Bool.new' }
  DEFAULTS.default_proc = proc { |_h, k| "CType::Stub.new(:#{k})" }

  attr_reader :src

  # @param src_path [String]
  # @param consts [Hash{ Symbol => Array<String> }]
  # @param values [Hash{ Symbol => Array<String> }]
  # @param funcs [Array<String>]
  # @param types [Array<String>]
  # @param dynamic_types [Array<String>] #ifdef-dependent immediate types, which need Primitive.cexpr! for type detection
  # @param skip_fields [Hash{ Symbol => Array<String> }] Struct fields that are skipped from bindgen
  # @param ruby_fields [Hash{ Symbol => Array<String> }] Struct VALUE fields that are considered Ruby objects
  def initialize(src_path:, consts:, values:, funcs:, types:, dynamic_types:, skip_fields:, ruby_fields:)
    @preamble, @postamble = split_ambles(src_path)
    @src = String.new
    @consts = consts.transform_values(&:sort)
    @values = values.transform_values(&:sort)
    @funcs = funcs.sort
    @types = types.sort
    @dynamic_types = dynamic_types.sort
    @skip_fields = skip_fields.transform_keys(&:to_s)
    @ruby_fields = ruby_fields.transform_keys(&:to_s)
    @references = Set.new
  end

  def generate(nodes)
    println @preamble

    # Define macros/enums
    @consts.each do |type, values|
      values.each do |value|
        raise "#{value} isn't a valid constant name" unless ('A'..'Z').include?(value[0])
        println "  C::#{value} = Primitive.cexpr! %q{ #{type}2NUM(#{value}) }"
      end
    end
    println

    # Define variables
    @values.each do |type, values|
      values.each do |value|
        println "  def C.#{value} = Primitive.cexpr!(%q{ #{type}2NUM(#{value}) })"
      end
    end
    println

    # Define function pointers
    @funcs.each do |func|
      println "  def C.#{func}"
      println "    Primitive.cexpr! %q{ SIZET2NUM((size_t)#{func}) }"
      println "  end"
      println
    end

    # Build a hash table for type lookup by name
    nodes_index = flatten_nodes(nodes).group_by(&:spelling).transform_values do |values|
      # Try to search a declaration with definitions
      node_with_children = values.find { |v| !v.children.empty? }
      next node_with_children if node_with_children

      # Otherwise, assume the last one is the main declaration
      values.last
    end

    # Define types
    @types.each do |type|
      unless definition = generate_node(nodes_index[type])
        raise "Failed to find or generate type: #{type}"
      end
      println "  def C.#{type}"
      println "@#{type} ||= #{definition}".gsub(/^/, "    ").chomp
      println "  end"
      println
    end

    # Define dynamic types
    @dynamic_types.each do |type|
      unless generate_node(nodes_index[type])&.start_with?('CType::Immediate')
        raise "Non-immediate type is given to dynamic_types: #{type}"
      end
      # Only one Primitive.cexpr! is allowed for each line: https://github.com/ruby/ruby/pull/9612
      println "  def C.#{type}"
      println "    @#{type} ||= CType::Immediate.find("
      println "      Primitive.cexpr!(\"SIZEOF(#{type})\"),"
      println "      Primitive.cexpr!(\"SIGNED_TYPE_P(#{type})\"),"
      println "    )"
      println "  end"
      println
    end

    # Leave a stub for types that are referenced but not targeted
    (@references - @types - @dynamic_types).each do |type|
      println "  def C.#{type}"
      println "    #{DEFAULTS[type]}"
      println "  end"
      println
    end

    print @postamble
  end

  private

  # Make an array that includes all top-level and nested nodes
  def flatten_nodes(nodes)
    result = []
    nodes.each do |node|
      unless node.children.empty?
        result.concat(flatten_nodes(node.children))
      end
    end
    result.concat(nodes) # prioritize top-level nodes
    result
  end

  # Return code before BINDGEN_BEG and code after BINDGEN_END
  def split_ambles(src_path)
    lines = File.read(src_path).lines

    preamble_end = lines.index { |l| l.include?(BINDGEN_BEG) }
    raise "`#{BINDGEN_BEG}` was not found in '#{src_path}'" if preamble_end.nil?

    postamble_beg = lines.index { |l| l.include?(BINDGEN_END) }
    raise "`#{BINDGEN_END}` was not found in '#{src_path}'" if postamble_beg.nil?
    raise "`#{BINDGEN_BEG}` was found after `#{BINDGEN_END}`" if preamble_end >= postamble_beg

    return lines[0..preamble_end].join, lines[postamble_beg..-1].join
  end

  # Generate code from a node. Used for constructing a complex nested node.
  # @param node [Node]
  def generate_node(node, sizeof_type: nil)
    case node&.kind
    when :struct, :union
      # node.spelling is often empty for union, but we'd like to give it a name when it has one.
      buf = +"CType::#{node.kind.to_s.sub(/\A[a-z]/, &:upcase)}.new(\n"
      buf << "  \"#{node.spelling}\", Primitive.cexpr!(\"SIZEOF(#{sizeof_type || node.type})\"),\n"
      bit_fields_end = node.children.index { |c| c.bitwidth == -1 } || node.children.size # first non-bit field index
      node.children.each_with_index do |child, i|
        skip_type = sizeof_type&.gsub(/\(\(struct ([^\)]+) \*\)NULL\)->/, '\1.') || node.spelling
        next if @skip_fields.fetch(skip_type, []).include?(child.spelling)
        field_builder = proc do |field, type|
          if node.kind == :struct
            to_ruby = @ruby_fields.fetch(node.spelling, []).include?(field)
            if child.bitwidth > 0
              if bit_fields_end <= i # give up offsetof calculation for non-leading bit fields
                raise "non-leading bit fields are not supported. consider including '#{field}' in skip_fields."
              end
              offsetof = node.offsetof.fetch(field)
            else
              off_type = sizeof_type || "(*((#{node.type} *)NULL))"
              offsetof = "Primitive.cexpr!(\"OFFSETOF(#{off_type}, #{field})\")"
            end
            "  #{field}: [#{type}, #{offsetof}#{', true' if to_ruby}],\n"
          else
            "  #{field}: #{type},\n"
          end
        end

        case child
        # BitField is struct-specific. So it must be handled here.
        in Node[kind: :field_decl, spelling:, bitwidth:, children: [_grandchild, *]] if bitwidth > 0
          buf << field_builder.call(spelling, "CType::BitField.new(#{bitwidth}, #{node.offsetof.fetch(spelling) % 8})")
        # "(unnamed ...)" struct and union are handled here, which are also struct-specific.
        in Node[kind: :field_decl, spelling:, type:, children: [grandchild]] if type.match?(/\((unnamed|anonymous) [^)]+\)\z/)
          if sizeof_type
            child_type = "#{sizeof_type}.#{child.spelling}"
          else
            child_type = "((#{node.type} *)NULL)->#{child.spelling}"
          end
          buf << field_builder.call(spelling, generate_node(grandchild, sizeof_type: child_type).gsub(/^/, '  ').sub(/\A +/, ''))
        # In most cases, we'd like to let generate_type handle the type unless it's "(unnamed ...)".
        in Node[kind: :field_decl, spelling:, type:] if !type.empty?
          buf << field_builder.call(spelling, generate_type(type))
        else # forward declarations are ignored
        end
      end
      buf << ")"
    when :typedef_decl
      case node.children
      in [child]
        generate_node(child)
      in [child, Node[kind: :integer_literal]]
        generate_node(child)
      in _ unless node.typedef_type.empty?
        generate_type(node.typedef_type)
      end
    when :enum_decl
      generate_type('int')
    when :type_ref
      generate_type(node.spelling)
    end
  end

  # Generate code from a type name. Used for resolving the name of a simple leaf node.
  # @param type [String]
  def generate_type(type)
    if type.match?(/\[\d+\]\z/)
      return "CType::Array.new { #{generate_type(type.sub!(/\[\d+\]\z/, ''))} }"
    end
    type = type.delete_suffix('const')
    if type.end_with?('*')
      if type == 'const void *'
        # `CType::Pointer.new { CType::Immediate.parse("void") }` is never useful,
        # so specially handle that case here.
        return 'CType::Immediate.parse("void *")'
      end
      return "CType::Pointer.new { #{generate_type(type.delete_suffix('*').rstrip)} }"
    end

    type = type.gsub(/((const|volatile) )+/, '').rstrip
    if type.start_with?(/(struct|union|enum) /)
      target = type.split(' ', 2).last
      push_target(target)
      "self.#{target}"
    else
      begin
        ctype = Fiddle::Importer.parse_ctype(type)
      rescue Fiddle::DLError
        push_target(type)
        "self.#{type}"
      else
        # Convert any function pointers to void* to workaround FILE* vs int*
        if ctype == Fiddle::TYPE_VOIDP
          "CType::Immediate.parse(\"void *\")"
        else
          "CType::Immediate.parse(#{type.dump})"
        end
      end
    end
  end

  def print(str)
    @src << str
  end

  def println(str = "")
    @src << str << "\n"
  end

  def chomp
    @src.delete_suffix!("\n")
  end

  def rstrip!
    @src.rstrip!
  end

  def push_target(target)
    unless target.match?(/\A\w+\z/)
      raise "invalid target: #{target}"
    end
    @references << target
  end
end

src_dir = File.expand_path('../..', __dir__)
src_path = File.join(src_dir, 'rjit_c.rb')
build_dir = File.expand_path(build_dir)
cflags = [
  src_dir,
  build_dir,
  File.join(src_dir, 'include'),
  File.join(build_dir, ".ext/include/#{RUBY_PLATFORM}"),
].map { |dir| "-I#{dir}" }

# Clear .cache/clangd created by the language server, which could break this bindgen
clangd_cache = File.join(src_dir, '.cache/clangd')
if Dir.exist?(clangd_cache)
  system('rm', '-rf', clangd_cache, exception: true)
end

# Parse rjit_c.h and generate rjit_c.rb
nodes = HeaderParser.new(File.join(src_dir, 'rjit_c.h'), cflags: cflags).parse
generator = BindingGenerator.new(
  src_path: src_path,
  consts: {
    LONG: %w[
      UNLIMITED_ARGUMENTS
      VM_ENV_DATA_INDEX_ME_CREF
      VM_ENV_DATA_INDEX_SPECVAL
    ],
    SIZET: %w[
      ARRAY_REDEFINED_OP_FLAG
      BOP_AND
      BOP_AREF
      BOP_EQ
      BOP_EQQ
      BOP_FREEZE
      BOP_GE
      BOP_GT
      BOP_LE
      BOP_LT
      BOP_MINUS
      BOP_MOD
      BOP_OR
      BOP_PLUS
      BUILTIN_ATTR_LEAF
      HASH_REDEFINED_OP_FLAG
      INTEGER_REDEFINED_OP_FLAG
      INVALID_SHAPE_ID
      METHOD_VISI_PRIVATE
      METHOD_VISI_PROTECTED
      METHOD_VISI_PUBLIC
      METHOD_VISI_UNDEF
      OBJ_TOO_COMPLEX_SHAPE_ID
      OPTIMIZED_METHOD_TYPE_BLOCK_CALL
      OPTIMIZED_METHOD_TYPE_CALL
      OPTIMIZED_METHOD_TYPE_SEND
      OPTIMIZED_METHOD_TYPE_STRUCT_AREF
      OPTIMIZED_METHOD_TYPE_STRUCT_ASET
      RARRAY_EMBED_FLAG
      RARRAY_EMBED_LEN_MASK
      RARRAY_EMBED_LEN_SHIFT
      RMODULE_IS_REFINEMENT
      ROBJECT_EMBED
      RSTRUCT_EMBED_LEN_MASK
      RUBY_EVENT_CLASS
      RUBY_EVENT_C_CALL
      RUBY_EVENT_C_RETURN
      RUBY_FIXNUM_FLAG
      RUBY_FLONUM_FLAG
      RUBY_FLONUM_MASK
      RUBY_IMMEDIATE_MASK
      RUBY_SPECIAL_SHIFT
      RUBY_SYMBOL_FLAG
      RUBY_T_ARRAY
      RUBY_T_CLASS
      RUBY_T_ICLASS
      RUBY_T_HASH
      RUBY_T_MASK
      RUBY_T_MODULE
      RUBY_T_STRING
      RUBY_T_SYMBOL
      RUBY_T_OBJECT
      SHAPE_FLAG_SHIFT
      SHAPE_FROZEN
      SHAPE_ID_NUM_BITS
      SHAPE_IVAR
      SHAPE_MASK
      SHAPE_ROOT
      STRING_REDEFINED_OP_FLAG
      T_OBJECT
      VM_BLOCK_HANDLER_NONE
      VM_CALL_ARGS_BLOCKARG
      VM_CALL_ARGS_SPLAT
      VM_CALL_FCALL
      VM_CALL_FORWARDING
      VM_CALL_KWARG
      VM_CALL_KW_SPLAT
      VM_CALL_KW_SPLAT_MUT
      VM_CALL_KW_SPLAT_bit
      VM_CALL_OPT_SEND
      VM_CALL_TAILCALL
      VM_CALL_TAILCALL_bit
      VM_CALL_ZSUPER
      VM_ENV_DATA_INDEX_FLAGS
      VM_ENV_DATA_SIZE
      VM_ENV_FLAG_LOCAL
      VM_ENV_FLAG_WB_REQUIRED
      VM_FRAME_FLAG_BMETHOD
      VM_FRAME_FLAG_CFRAME
      VM_FRAME_FLAG_CFRAME_KW
      VM_FRAME_FLAG_LAMBDA
      VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM
      VM_FRAME_MAGIC_BLOCK
      VM_FRAME_MAGIC_CFUNC
      VM_FRAME_MAGIC_METHOD
      VM_METHOD_TYPE_ALIAS
      VM_METHOD_TYPE_ATTRSET
      VM_METHOD_TYPE_BMETHOD
      VM_METHOD_TYPE_CFUNC
      VM_METHOD_TYPE_ISEQ
      VM_METHOD_TYPE_IVAR
      VM_METHOD_TYPE_MISSING
      VM_METHOD_TYPE_NOTIMPLEMENTED
      VM_METHOD_TYPE_OPTIMIZED
      VM_METHOD_TYPE_REFINED
      VM_METHOD_TYPE_UNDEF
      VM_METHOD_TYPE_ZSUPER
      VM_SPECIAL_OBJECT_VMCORE
      RUBY_ENCODING_MASK
      RUBY_FL_FREEZE
      RHASH_PASS_AS_KEYWORDS
    ],
  },
  values: {
    SIZET: %w[
      block_type_iseq
      imemo_iseq
      imemo_callinfo
      rb_block_param_proxy
      rb_cArray
      rb_cFalseClass
      rb_cFloat
      rb_cInteger
      rb_cNilClass
      rb_cString
      rb_cSymbol
      rb_cTrueClass
      rb_rjit_global_events
      rb_mRubyVMFrozenCore
      rb_vm_insns_count
      idRespond_to_missing
    ],
  },
  funcs: %w[
    rb_ary_entry_internal
    rb_ary_push
    rb_ary_resurrect
    rb_ary_store
    rb_ec_ary_new_from_values
    rb_ec_str_resurrect
    rb_ensure_iv_list_size
    rb_fix_aref
    rb_fix_div_fix
    rb_fix_mod_fix
    rb_fix_mul_fix
    rb_gc_writebarrier
    rb_get_symbol_id
    rb_hash_aref
    rb_hash_aset
    rb_hash_bulk_insert
    rb_hash_new
    rb_hash_new_with_size
    rb_hash_resurrect
    rb_ivar_get
    rb_obj_as_string_result
    rb_obj_is_kind_of
    rb_str_concat_literals
    rb_str_eql_internal
    rb_str_getbyte
    rb_vm_bh_to_procval
    rb_vm_concat_array
    rb_vm_defined
    rb_vm_get_ev_const
    rb_vm_getclassvariable
    rb_vm_ic_hit_p
    rb_vm_opt_newarray_min
    rb_vm_opt_newarray_max
    rb_vm_opt_newarray_hash
    rb_vm_opt_newarray_pack
    rb_vm_setinstancevariable
    rb_vm_splat_array
    rjit_full_cfunc_return
    rjit_optimized_call
    rjit_str_neq_internal
    rjit_record_exit_stack
    rb_ivar_defined
    rb_vm_throw
    rb_backref_get
    rb_reg_last_match
    rb_reg_match_pre
    rb_reg_match_post
    rb_reg_match_last
    rb_reg_nth_match
    rb_gvar_get
    rb_range_new
    rb_ary_tmp_new_from_values
    rb_reg_new_ary
    rb_ary_clear
    rb_str_intern
    rb_vm_setclassvariable
    rb_str_bytesize
    rjit_str_simple_append
    rb_str_buf_append
    rb_str_dup
    rb_vm_yield_with_cfunc
    rb_vm_set_ivar_id
    rb_ary_dup
    rjit_rb_ary_subseq_length
    rb_ary_unshift_m
    rjit_build_kwhash
    rb_rjit_entry_stub_hit
    rb_rjit_branch_stub_hit
    rb_sym_to_proc
  ],
  types: %w[
    CALL_DATA
    IC
    ID
    IVC
    RArray
    RB_BUILTIN
    RBasic
    RObject
    RStruct
    RString
    attr_index_t
    iseq_inline_constant_cache
    iseq_inline_constant_cache_entry
    iseq_inline_iv_cache_entry
    iseq_inline_storage_entry
    method_optimized_type
    rb_block
    rb_block_type
    rb_builtin_function
    rb_call_data
    rb_callable_method_entry_struct
    rb_callable_method_entry_t
    rb_callcache
    rb_callinfo
    rb_captured_block
    rb_cfunc_t
    rb_control_frame_t
    rb_cref_t
    rb_execution_context_struct
    rb_execution_context_t
    rb_iseq_constant_body
    rb_iseq_location_t
    rb_iseq_struct
    rb_iseq_t
    rb_method_attr_t
    rb_method_bmethod_t
    rb_method_cfunc_t
    rb_method_definition_struct
    rb_method_entry_t
    rb_method_iseq_t
    rb_method_optimized_t
    rb_method_type_t
    rb_proc_t
    rb_rjit_runtime_counters
    rb_serial_t
    rb_shape
    rb_shape_t
    rb_thread_struct
    rb_jit_func_t
    rb_iseq_param_keyword
    rb_rjit_options
    rb_callinfo_kwarg
  ],
  # #ifdef-dependent immediate types, which need Primitive.cexpr! for type detection
  dynamic_types: %w[
    VALUE
    shape_id_t
  ],
  skip_fields: {
    'rb_execution_context_struct.machine': %w[regs], # differs between macOS and Linux
    rb_execution_context_struct: %w[method_missing_reason], # non-leading bit fields not supported
    rb_iseq_constant_body: %w[jit_exception jit_exception_calls yjit_payload yjit_calls_at_interv], # conditionally defined
    rb_thread_struct: %w[status has_dedicated_nt to_kill abort_on_exception report_on_exception pending_interrupt_queue_checked],
    :'' => %w[is_from_method is_lambda is_isolated], # rb_proc_t
  },
  ruby_fields: {
    rb_iseq_constant_body: %w[
      rjit_blocks
    ],
    rb_iseq_location_struct: %w[
      base_label
      label
      pathobj
    ],
    rb_callable_method_entry_t: %w[
      defined_class
    ],
    rb_callable_method_entry_struct: %w[
      defined_class
    ],
  },
)
generator.generate(nodes)

# Write rjit_c.rb
File.write(src_path, generator.src)