Ruby1.9.0のバイトコードをいじり倒す(その9)

「まつもと直伝 プログラミングのオキテ 第18回 プログラムを高速化する(その2)」(http://itpro.nikkeibp.co.jp/article/COLUMN/20080125/292086/?P=3&ST=lin-os)を読んでいて、バイトコードパッチを使ってプロファイラを作るとどれくらいパフォーマンスが出るんだろうと思ったので、作ってみました。

記事中のオリジナルのマンデルブロ集合計算プログラムをプロファイラありと無しで比べてみました。

無しの場合

$ time /usr/local/bin/ruby.exe mandel.rb >/dev/null

real    0m1.197s
user    0m1.186s
sys     0m0.015s

有りの場合

$ time /usr/local/bin/ruby.exe debug/prof.rb mandel.rb >/dev/null

  %   cumulative   self              self     total
 time   seconds   seconds    calls  ms/call  ms/call  name
 53.27    11.83     11.83     3750     3.15     5.75  Object#mandelbrot
  9.01    13.83      2.00   117551     0.02     0.02  Complex#abs
  8.34    15.68      1.85   114769     0.02     0.02  Complex#*
  8.33    17.53      1.85   114769     0.02     0.02  Complex#+
  6.68    19.01      1.48   118569     0.01     0.01  Fixnum#<
  6.42    20.44      1.43   121301     0.01     0.01  Float#<
  4.78    21.50      1.06   114819     0.01     0.01  Fixnum#+
  2.20    21.99      0.49        1   488.00 22188.00  Object#mandel_calc
  0.49    22.09      0.11     7500     0.01     0.01  Class#new
  0.21    22.14      0.05     3700     0.01     0.01  Float#+
  0.14    22.17      0.03     3751     0.01     0.01  Fixnum#-
  0.07    22.19      0.02     3750     0.00     0.00  Object#printf
  0.00    22.19      0.00       50     0.00     0.00  Float#>
  0.00    22.19      0.00      100     0.00     0.00  Object#print
  0.00    22.19      0.00       49     0.00     0.00  Float#-
  0.00    22.19      0.00        1     0.00     0.00  Fixnum#>
  0.00    22.20      0.00        1     0.00 22203.00  #toplevel

real    0m23.243s
user    0m22.311s
sys     0m0.905s

約20分の1の速度低下です。オリジナルのprofile.rbよりは速いですが...。ruby-profすごいです。

あと、complrex.rbのメソッドはパッチを当ててprofileするとコアをはいてしまうので、complex.rbのメソッドはprofileの対象になっていないです。

ソースコード(prof.rb/instruction.rb)です。実行するためにはrubyにパッチを当てる必要があります(http://d.hatena.ne.jp/miura1729/20080101/1199154900)。

内容は標準ライブラリのprofiler.rbをほぼそのまま使っています。set_trace_funcの代わりにバイトコードパッチを使っています。

handle_call/handle_returnがset_trace_funcのハンドラに相当します。これらを書き直せば、もうちょっとスピードアップするんじゃないかなと考えています。

#!/usr/local/bin/ruby -w

require 'instruction'
require 'pp'

# Profiler by bytecode patch
module PatchProfiler
  include VMLib
  @@start = @@stack = @@map = nil
  @@objstk = nil
  @@loaded_file = nil

  module_function

  def handle_call(obj)
    now = Process.times[0]
    @@objstk.push obj
    @@stack.push [now, 0.0]
  end

  def handle_return(method)
#    print "#{obj}:#{method}\n"
    obj = @@objstk.pop
    now = Process.times[0]
    key = [obj.class, method]
    if tick = @@stack.pop
      data = (@@map[key] ||= [0, 0.0, 0.0, key])
      data[0] += 1
      cost = now - tick[0]
      data[1] += cost
      data[2] += cost - tick[1]
      @@stack[-1][1] += cost if @@stack[-1]
    end
  end

  def handle_require(fn)
    if @@loaded_file.include?(fn) then
      return
    end

    @@loaded_file.push fn
    path = require_extend(fn)
    if path == nil or true then
      require fn
    else
      iseq = VM::InstructionSequence.compile_file(path)
      iseqt = InstSeqTree.new(nil, iseq)
      iseqt = patch_for_profile(iseqt)
#      print VM::InstructionSequence.load(iseqt.to_a).disasm
      VM::InstructionSequence.load(iseqt.to_a).eval
    end
  end

  def require_extend(fname)
    $:.each do |dir|
      rfname = dir + "/" + fname
      if FileTest.file?(rfname) then
        return rfname
      end
      
      rfname = dir + "/" + fname + ".rb"
      if FileTest.file?(rfname) then
        return rfname
      end
    end

    return nil
  end

  def start_profile
    @@start = Process.times[0]
    @@stack = []
    @@map = {}
    @@loaded_file = []
    @@objstk = []

    iseq = VM::InstructionSequence.compile_file(ARGV[0])
    ARGV.shift
    iseqt = InstSeqTree.new(nil, iseq)
    iseqt = patch_for_profile(iseqt)
#    print VM::InstructionSequence.load(iseqt.to_a).disasm
    VM::InstructionSequence.load(iseqt.to_a).eval
  end
      
  def print_profile(f)
    total = Process.times[0] - @@start
    if total == 0 then total = 0.01 end
    data = @@map.values
    data = data.sort_by{|x| -x[2]}
    sum = 0
    f.printf "  %%   cumulative   self              self     total\n"
    f.printf " time   seconds   seconds    calls  ms/call  ms/call  name\n"
    for d in data
      sum += d[2]
      f.printf "%6.2f %8.2f  %8.2f %8d ", d[2]/total*100, sum, d[2], d[0]
      f.printf "%8.2f %8.2f  %s\n", d[2]*1000/d[0], d[1]*1000/d[0], get_name(*d[3])
    end
    f.printf "%6.2f %8.2f  %8.2f %8d ", 0.0, total, 0.0, 1     # ???
    f.printf "%8.2f %8.2f  %s\n", 0.0, total*1000, "#toplevel" # ???
  end
      
  def get_name(klass, id)
    name = klass.to_s || ""
    if klass.kind_of? Class
      name += "#"
    else
      name += "."
    end
    name + id.id2name
  end
  private :get_name

  def patch_for_profile(iseqt)
    iseqt.add_code_all_around_send {|fname, info, no, code|
      lambda {|mname, sendno, inst|
        case mname
        when :require
          [
            [:swap],
            [:pop],
            [:putnil],
            [:getconstant, :PatchProfiler],
            [:swap],
            [:send, :handle_require, 1, nil, 0, nil],
          ]
          
        else
          # sendの第2引数は引数の数
          recpos = inst[0][2] + 1

          # sendの第4引数はフラグでbit4が1だとレシーバはself
          sendflg = inst[0][4]

          if sendflg & 8 == 0 then
            receiver = [:topn, recpos]
          else
            receiver = [:putself]
          end

          [
            [:putnil],
            [:getconstant, :PatchProfiler],
            receiver,
            [:send, :handle_call, 1, nil, 0, nil],
            [:pop]
          ] + inst +
            [
            [:putnil],
            [:getconstant, :PatchProfiler],
            [:putobject, mname],
            [:send, :handle_return, 1, nil, 0, nil],
            [:pop]
          ]
        end
      }
    }

    iseqt
  end
  private :patch_for_profile
end

VM::InstructionSequence.compile_option = {
  :specialized_instruction => false
}

PatchProfiler::start_profile
PatchProfiler::print_profile(STDERR)

実行にはあとinstruction.rbが必要です。以下に示します。

#
#  instruction.rb - structured bytecode library
#
#
module VMLib
  class InstSeqTree
    Headers = %w(magic major_version minor_version format_type
                 misc name filename line type locals args exception_table)
#                 misc name filename line type locals args exception_table)
#  call-seq:
#     VMLib::InstSeqTree.new(parent, iseq)
#        parent  Partent of InstSeqTree
#                For example, when you will construct InstSeqTree of
#                the method body, you must 'parent' is InstSeqTree of definition
#                code of the method.
#                If parent is none, 'parent' is nil.
#        iseq    Instruction Sequence, Normally the result of 
#                VM::InstructionSequence.compile(...) or 
#                VM::InstructionSequence.compile_file(...)
    def initialize(parent = nil, iseq = nil)
      @klasses = {}
      @methodes = {}
      @blockes = {}
      @line = {}
      @line[nil] = []
      @line_list = []
      
      @header = {}
      @body = nil
      @parent = parent
      @cur_send_no = 0

      Headers.each do |name|
        @header[name] = nil
      end
      
      if iseq then
        init_from_ary(iseq.to_a)
      end
    end

    attr :header
    attr :klasses
    attr :methodes
    attr :blockes
    attr :line
    attr :line_list
    attr :body
    attr :parent
    
    def init_from_ary(ary)
      i = 0
      Headers.each do |name|
        @header[name] = ary[i]
        i = i + 1
      end

      @body = ary[i]
      curlinno = nil
      @body.each do |inst|
        if inst.is_a? Integer then
          # Line number
          curlinno = "#{inst}"
          i = 1
          while @line_list.include?(curlinno) do
            curlinno = "#{inst}-#{i}"
            i = i + 1
          end
          @line_list.push curlinno
          @line[curlinno] = []

        elsif inst.is_a? Array
          case inst[0]
          when :defineclass
            if inst[2] then
              obj = InstSeqTree.new(self)
              obj.init_from_ary(inst[2])
              @klasses[inst[1]] = obj
            end
            
          when :definemethod
            if inst[2] then
              obj = InstSeqTree.new(self)
              obj.init_from_ary(inst[2])
              @methodes[inst[1]] = obj
            end
            
          # inst[3]にはブロック内のメソッドの通し番号が入る
          # この通し番号は例外処理でreturn pointの確定などに
          # 使われる。
          when :send
            if inst[3] then
              obj = InstSeqTree.new(self)
              obj.init_from_ary(inst[3])
              @blockes[@cur_send_no]= obj
              inst[3] = [inst[3], @cur_send_no]
            else
              inst[3] = [nil, @cur_send_no]
            end
            @cur_send_no += 1
            
          when :invokesuper
            if inst[2] then
              obj = InstSeqTree.new(self)
              obj.init_from_ary(inst[2])
              @blockes[@cur_send_no] = obj
              inst[2] = [inst[3], @cur_send_no]
            else
              inst[2] = [nil, @cur_send_no]
            end
            @cur_send_no += 1
          end

          @line[curlinno].push inst
          
        elsif inst.is_a? Symbol
          # Label
          if !@line_list.include?(curlinno) then
            @line_list.push curlinno
          end
          @line[curlinno].push inst
            
        else
          raise inst
        end
      end
    end

    def to_a
      res = []
      Headers.each do |name|
        res.push @header[name]
      end
      
      body = []
      @line_list.each do |ln|
        body.push ln.to_i
        @line[ln].each do |inst|
          if inst.is_a? Array then
            cinst = inst.clone
            case inst[0]
            when :defineclass
              if inst[2] then
                cinst[2] = @klasses[inst[1]].to_a
              end
              
            when :definemethod
              if inst[2] then
                cinst[2] = @methodes[inst[1]].to_a
              end
              
            when :send
              if inst[3] and inst[3][0] then
                cinst[3] = @blockes[inst[3][1]].to_a
              else
                cinst[3] = nil
              end
              
            when :invokesuper
              if inst[2] and inst[2][0] then
                cinst[2] = @blockes[inst[2][1]].to_a
              else
                cinst[2] = nil
              end
            end
            body.push cinst
          else
            body.push inst
          end
        end
      end
      
      res.push body
      res
    end

    def add_code_all_before_block(&action)
      traverse_code([nil, nil, nil]) do |code, info|
        if code.line_list[0] != :before then
          code.line_list.unshift :before
          code.line[:before] = []
        end
        
        curlno = 0
        code.line_list.sort_by {|item| item.to_i}.each do |n|
          if n.to_i != 0 then
            curlno = n
            break
          end
        end
        line = code.line
        inscode = action.call(header['filename'], info, curlno, code)
        line[:before] =  insert_code(inscode, line[:before], 0)
      end
    end

    # 命令の最後にコードを付加するのではなく、:leave命令の直前に
    # 命令を付加しているのに注意
    def add_code_all_before_return(&action)
      traverse_code([nil, nil, nil]) do |code, info|
        line = code.line
        code.line.each do |no, cont|
          insact = action.call(header['filename'], info, no, code)
          line[no] = insert_code_before_return(insact, line[no], code)
        end
      end
    end

    def add_code_all_around_send(&action)
      traverse_code([nil, nil, nil]) do |code, info|
        line = code.line
        code.line.each do |no, cont|
          insact = action.call(header['filename'], info, no, code)
          line[no] = insert_code_around_send(insact, line[no], code)
        end
      end
    end

    def add_code_all_before_line(&action)
      traverse_code([nil, nil, nil]) do |code, info|
        line = code.line
        code.line.each do |no, cont|
          inscode = action.call(header['filename'], info, no, code)
          line[no] = insert_code(inscode, line[no], 0)
        end
      end
    end

    def traverse_code(info, &action)
      action.call(self, info)
      
      blockes.each do |sno, cont|
        cont.traverse_code([info[0], info[1], sno], &action)
      end

      klasses.each do |name, cont|
        cont.traverse_code([name, nil, nil], &action)
      end
      
      methodes.each do |name, cont|
        cont.traverse_code([info[0], name, nil], &action)
      end
    end


    private

    def join(*arg)
      res = []
      arg.each do |n|
        if n then
          res += n
        end
      end
      
      res
    end

    # ラベルと:getinlinecache命令の間に命令を挿入するとコア吐くので
    # それを防ぐ
    # insposは挿入地点で0が先頭
    def insert_code(code, line, inspos)
      if line[inspos].is_a? Array and line[inspos][0] == :getinlinecache then
        join(line[0, inspos + 1], code, line[inspos+1..-1])
      else
        join(line[0, inspos], code, line[inspos..-1])
      end
    end

    # sendの前後にコードを挿入する
    # スタックフレームの作成に使う
    # insact: 実行するとsendの前、後の2つのコードを返す
    # sendに続くラベルはbreakの例外処理で使うから、sendとくっつけて
    # 1つの命令として扱う
    # sendとラベルの間には命令は挟まない。さもないと、localjumperrorが出る。
    def insert_code_around_send(insact, line, ctree)
      res = []
      issend = false
      line.each_with_index do |inst, i|
        if issend and inst.is_a? Symbol then
          issend = false
          next
        end
        issend = false
        sendno = nil
        mname = nil

        if inst.is_a? Array then
          if inst[0] == :send then
            issend = true
            sendno = inst[3][1]
            mname = inst[1]
          elsif inst[0] == :sendsuper then
            issend = true
            sendno = inst[2][1]
            mname = inst[1]
          end
        end
        
        if issend then
          if line[i+1].is_a? Symbol then
            insts = [inst, line[i + 1]]
          else
            insts = [inst]
          end
          res = res + insact.call(mname, sendno, insts)
        else
          res.push inst
        end
      end
      res
    end

    TAG_RETURN = 1
    TAG_BREAK = 2
    def insert_code_before_return(insact, line, ctree)
      line.inject([]) do |res, inst|
        if inst.is_a? Array then
          if inst[0] == :leave then
            res = res + insact.call(1)
          elsif inst[0] == :throw then
            if inst[1] == TAG_RETURN then
              # return 
              # この場合は一番外側のmethodまでたどってそこにreturn
              itree = ctree
              n = 1
              while itree do
                if itree.header['type'] == :method then
                  break
                end
#                p itree.header['type']
                n += 1
                itree = itree.parent
              end
              
              if itree then
                res = res + insact.call(n)
              end
            elsif inst[1] == TAG_BREAK then
              # break
              # この場合はブロックにreturnする
              res = res + insact.call(1)
            end
          end
        end
        res.push inst
        
        res
      end
    end
  end
end