I should warn that my solution below is rather long. I went down a
possibly more traditional route of writing a generic tokenizer/lexer.
I don't know if these are still commonly used, but I couldn't find an
existing implementation in the Ruby Standard Library.
Back to the task...
There's a big list of test cases in my unit tests (included below).
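In case you want to try it out, this is roughly how the two files are
driven (a minimal sketch: it assumes the quiz's interp.rb, which
defines the Interpreter class, is on the load path):

  require 'compiler_mw'
  bytecode = Compiler.compile('3+4*2')
  Interpreter.new(bytecode).run   # => [11]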
The code follows; there are two files in total.
##################################################################
# = compiler_mw.rb - bytecode compiler
#
# Author:: Marcel Ward <wardies ^a-t^ gmaildotcom>
# Documentation:: Marcel Ward
# Last Modified:: Monday, 06 November 2006
require 'interp'
require 'lexer_mw'
module Compiler
# The lexer needs to know the character sets involved in deciding
# which state transition will be fired...
CHAR_SETS = {
:plus => [?+], :minus => [?-],
:digit => /\d/,
:div_mod => [?/, ?%], # matches '/' or '%'
:asterisk => [?*],
:open_paren => [?(], :close_paren => [?)]
}
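# For example, under these sets ?7 falls in :digit, ?% in :div_mod
# and ?( in :open_paren; a character that matches no set (e.g.
# whitespace) has no charset id, so the lexer raises InvalidLexeme.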
# Tell the lexer how to parse a datastream: which tokens to
# generate, what state to switch to, etc.
# This table was designed according to my vague recollection of
# the Dragon Book on compiler construction by Aho/Sethi/Ullman.
STATE_TRANS_TABLE = {
:s_start => {
:plus => {:next_s_skip => :s_start},
:minus => {:next_s => :s_negate},
:digit => {:next_s => :s_numeric},
:open_paren => {:next_s => :s_start,
:token => :tok_open_paren}
},
:s_negate => {
:plus => {:next_s_skip => :s_negate},
:minus => {:next_s => :s_start},
:digit => {:next_s => :s_numeric},
:open_paren => {:next_s_backtrack => :s_start,
:token => :tok_negate}
},
:s_numeric => {
:plus => {:next_s_backtrack => :s_operator,
:token => :tok_int},
:minus => {:next_s_backtrack => :s_operator,
:token => :tok_int},
:digit => {:next_s => :s_numeric},
:div_mod => {:next_s_backtrack => :s_operator,
:token => :tok_int},
:asterisk => {:next_s_backtrack => :s_operator,
:token => :tok_int},
:close_paren => {:next_s_backtrack => :s_operator,
:token => :tok_int},
:eof => {:next_s_backtrack => :s_operator,
:token => :tok_int},
},
:s_operator => {
:plus => {:next_s => :s_start,
:token => :tok_add},
:minus => {:next_s => :s_start,
:token => :tok_subtract},
:div_mod => {:next_s => :s_start,
:token => :tok_div_mod},
:asterisk => {:next_s => :s_mult_or_power},
:close_paren => {:next_s => :s_operator,
:token => :tok_close_paren},
:eof => {} # when :next_s... is absent, finish
},
:s_mult_or_power => {
:plus => {:next_s_backtrack => :s_start,
:token => :tok_multiply},
:minus => {:next_s_backtrack => :s_start,
:token => :tok_multiply},
:digit => {:next_s_backtrack => :s_start,
:token => :tok_multiply},
:asterisk => {:next_s => :s_start,
:token => :tok_power},
:open_paren => {:next_s_backtrack => :s_start,
:token => :tok_multiply}
}
}
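# A worked example: lexing "-3*4" walks the table as follows.
# '-' in :s_start: go to :s_negate (lexeme "-")
# '3' in :s_negate: go to :s_numeric (lexeme "-3")
# '*' in :s_numeric: backtrack to :s_operator, emit [:tok_int, "-3"]
# '*' in :s_operator: go to :s_mult_or_power (lexeme "*")
# '4' in :s_mult_or_power: backtrack to :s_start, emit [:tok_multiply, "*"]
# '4' in :s_start: go to :s_numeric (lexeme "4")
# eof in :s_numeric: backtrack to :s_operator, emit [:tok_int, "4"]
# eof in :s_operator: no next state, so finish with [:tok_end, nil]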
# Compiles a string expression _sum_ into bytecode and returns
# the bytecode array (as per Ruby Quiz 100 requirements).
def self.compile(sum)
lexer = LexerMW.new()
lexer.init_char_sets(CHAR_SETS)
lexer.init_state_transitions(STATE_TRANS_TABLE)
toks = lexer.tokenize(sum)
puts toks.inspect + "\n\n" + toks.map {|a,b| b}.join(' ') \
if $DEBUG == 1
# Get the mnemonic stack by parsing the tokens.
mnemonic_stack = parse(toks)
puts "\nParsed toks => #{mnemonic_stack.inspect}" if $DEBUG == 1
# Last stage now, we convert our internal mnemonics directly
# to a byte stack in the required bytecode format.
mnemonics_to_bytecode(mnemonic_stack)
end
MNEMONIC_TO_BYTECODE = {
:tok_add => Interpreter::Ops::ADD,
:tok_subtract => Interpreter::Ops::SUB,
:tok_multiply => Interpreter::Ops::MUL,
:tok_divide => Interpreter::Ops::DIV,
:tok_modulo => Interpreter::Ops::MOD,
:tok_power => Interpreter::Ops::POW
}
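# For example, "2+3" parses to the mnemonic stack [2, 3, :tok_add],
# which the method below encodes as the byte array
# [CONST, 0, 2, CONST, 0, 3, ADD]
# (names standing in for the Ops constant values in interp.rb).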
# This exception is raised by the mnemonic-to-bytecode method when
# an integer constant cannot be pushed onto the interpreter
# bytecode stack because it is too big to fit the
# Interpreter::Ops::LCONST instruction.
class OutOfRangeError < StandardError
end
# Convert our internal _mnemonics_ directly to a byte array and
# return this in the required bytecode format, ready to execute.
def self.mnemonics_to_bytecode(mnemonics)
bc = []
mnemonics.each do |mnem|
if MNEMONIC_TO_BYTECODE.has_key? mnem
bc << MNEMONIC_TO_BYTECODE[mnem]
else
# Try packing this value as a 2- or 4-byte signed string
# and ensure we get back the same value on unpacking it.
if [mnem] == [mnem].pack('s').unpack('s')
# 2-bytes will be enough
bc << Interpreter::Ops::CONST
bc.concat([mnem].pack('n').unpack('C*'))
elsif [mnem] == [mnem].pack('l').unpack('l')
# 4-bytes will be enough
bc << Interpreter::Ops::LCONST
bc.concat([mnem].pack('N').unpack('C*'))
else
# It could be dangerous to silently fail when a
# number will not fit in a 4-byte signed int.
raise OutOfRangeError
end
end
end
bc
end
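# A quick sanity check of the round-trip test in irb:
# [40000].pack('s').unpack('s')  # => [-25536], so CONST won't do
# [40000].pack('l').unpack('l')  # => [40000], so LCONST is used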
# If there is a mismatch in the number of parentheses, this
# exception is raised by the #parse routine.
# E.g. "3+(4-2" and "(3-10))" are both considered invalid.
class ParenthesisError < StandardError
end
# The operator precedence hash helps the #parse method to
# decide when to store up operators and when to flush a load
# out: pending operators are flushed while their precedence is
# greater than or equal to that of the incoming token.
PAREN_PRECEDENCE = 0
OP_PRECEDENCE = {
:tok_end => -1,
:tok_open_paren => PAREN_PRECEDENCE,
:tok_close_paren => PAREN_PRECEDENCE,
:tok_add => 1, :tok_subtract => 1,
:tok_multiply => 2, :tok_div_mod => 2,
:tok_power => 3,
:tok_negate => 4
}
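# For example, #parse turns the token stream for "2+3*4" into the
# postfix mnemonic stack [2, 3, 4, :tok_multiply, :tok_add]:
# :tok_multiply outranks :tok_add, so it is flushed first when
# :tok_end pops the remaining operators in precedence order.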
# Parse an array of [token,value] pairs as returned by
# LexerMW::tokenize. Returns our own internal quasi-bytecode
# mnemonic array.
def self.parse(tokens)
operator_stack = []
ops = []
# Push the bottom-most element with precedence equivalent to that
# of :tok_end so when we see :tok_end all pending operation
# tokens on the stack get popped
precedence_stack = [OP_PRECEDENCE[:tok_end]]
tokens.each do |tok, val|
if tok == :tok_int
# "--3".to_i => 0 is bad, so use eval("--3") => 3 instead.
ops << eval(val)
else
precedence = OP_PRECEDENCE[tok]
if not tok == :tok_open_paren
while precedence <= precedence_stack.last &&
precedence_stack.last > PAREN_PRECEDENCE
# Workaround for the fact that the ** power operation
# associates right-to-left,
# i.e. 2**3**4 == 2**(3**4) != (2**3)**4
break if tok == :tok_power &&
precedence_stack.last == OP_PRECEDENCE[:tok_power]
precedence_stack.pop
ops << operator_stack.pop
end
end
# Divide and modulo come out of the lexer as the same token,
# so override tok according to its corresponding value.
tok = (val == '/') ? :tok_divide : :tok_modulo if tok == :tok_div_mod
case tok
when :tok_close_paren
precedence_stack.pop == PAREN_PRECEDENCE \
or raise ParenthesisError
when :tok_negate
# val contains just the minuses ('-', '--', '---', etc.)
# Optimise out (x) === --(x) === ----(x), etc.
if val.size % 2 == 1
# No negate function for -(x) so simulate using 0 - (x)
precedence_stack.push precedence
operator_stack.push :tok_subtract
ops << 0
end
when :tok_end
raise ParenthesisError if precedence_stack.size != 1
else
precedence_stack.push precedence
operator_stack.push tok unless tok == :tok_open_paren
end
end
end
ops
end
end
if $0 == __FILE__
eval DATA.read, nil, $0, __LINE__+4
end
__END__
require 'test/unit'
class TC_Compiler < Test::Unit::TestCase
def test_simple
@test_data = [
'8', '124', '32767', # +ve CONST
'-1', '-545', '-32768', # -ve CONST
'32768', '294833', '13298833', # +ve LCONST
'-32769', '-429433', '-24892810', # -ve LCONST
'4+5', '7-3', '30+40+50', '14-52-125', # ADD, SUB
'512243+1877324', '40394-12388423', # LCONST, ADD, SUB
'3*6', '-42*-90', '94332*119939', # MUL
'8/3', '-35/-15', '593823/44549', # DIV
'8%3', '243%-59', '53%28%9', # MOD
'531%-81%14', '849923%59422', #
'-2147483648--2147483648', # SUB -ve LCONST
'2**14', '-4**13+2' # POW
]
@test_data.each do |sum|
assert_equal [eval(sum)],
Interpreter.new(Compiler.compile(sum)).run,
"whilst calculating '#{sum}'"
end
end
def test_advanced
@test_data = [
'-(423)', '-(-523*32)', '---0',
'-(-(-(16**--++2)))',
'3**(9%5-1)/3+1235349%319883+24*-3',
'+42', '((2*-4-15/3)%16)', '4**3**((2*-4-15/3)%16)',
'64**-(-(-3+5)**3**2)', '4*165%41*341/7/2/15%15%13',
'--(---(4**3**((2*-4-15/3)%16))+++-410--4)'
]
@test_data.each do |sum|
assert_equal [eval(sum)],
Interpreter.new(Compiler.compile(sum)).run,
"whilst calculating '#{sum}'"
end
end
end
#!/usr/bin/env ruby
##################################################################
# = lexer_mw.rb - generic lexical analyser
#
# Author:: Marcel Ward <wardies ^a-t^ gmaildotcom>
# Documentation:: Marcel Ward
# Last Modified:: Monday, 06 November 2006
#
# Solution for Ruby Quiz number 100 - http://www.rubyquiz.com/
$DEBUG = 0
# If the lexer fails to find an appropriate entry in the state
# transition table for the current character and state, it
# raises this exception.
class LexerFailure < StandardError
end
# If the lexer encounters a character for which no matching charset
# has been supplied then it raises this exception.
#
# This exception will never be raised if #init_state_transitions
# has been called with an appropriate catch-all charset id.
class InvalidLexeme < StandardError
end
class LexerMW
# Creates an instance of the lexer class.
#
# _lexer_eof_ascii_::
# defines the ASCII byte value that the lexer considers as
# end-of-file when it is encountered. When #tokenize is called,
# the supplied datastream is automatically appended with this
# character.
def initialize(lexer_eof_ascii = 0)
@s_trans = {}
@columns = {}
@lex_eof = lexer_eof_ascii
end
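# e.g. LexerMW.new(?$) makes '$' the end-of-file marker; by default
# the NUL byte (0) is used, as the Compiler module above does.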
# Initialize the character set columns to be used by the lexer.
#
# _cs_defs_::
# a hash containing entries of the form <tt>id => match</tt>,
# where _match_ defines the characters to be matched and _id_
# is the id that will be passed to the finite state machine
# to indicate the character grouping encountered.
# _eof_charset_id_::
# defines the character set identifier which the lexer will
# attempt to match in the state machine table when the
# end-of-file character defined in #new is encountered.
#
# The content of _match_ falls into one of two main categories:
#
# _regexp_:: e.g. <tt>/\d/</tt> will match any digit 0..9; or
# _enum_:: an enumeration that describes the set of allowed
# character byte values, e.g.
# the array <tt>[?*, ?/, ?%]</tt> matches
# <b>*</b>, <b>/</b> or <b>%</b>, while the range
# <tt>(?a..?z)</tt> matches lowercase alphas.
#
# e.g.
#
# init_char_sets({
# :alphanum => /[A-Z0-9]/,
# :underscore => [?_],
# :lower_vowel => [?a, ?e, ?i, ?o, ?u],
# :special => (0..31)
# },
# :end_line)
#
# It is the responsibility of the caller to ensure that the
# match sets for each column are mutually exclusive.
#
# If a 'catch-all' set is needed then it is not necessary
# to build the set of all characters not already matched.
# Instead, see #init_state_transitions parameter list.
#
# Note, the contents of the hash are duplicated and stored
# internally to avoid any inadvertent corruption from outside.
def init_char_sets(cs_defs, eof_charset_id = :eof)
@charsets = {}
# Make a verbatim copy of the lexer charset columns
cs_defs.each_pair do |charset_id, match|
@charsets[charset_id] = match.dup # works for array/regexp
end
# Add an end-of-file charset column for free
@charsets[eof_charset_id] = [@lex_eof]
puts "@charsets =\n#{@charsets.inspect}\n\n" if $DEBUG == 1
end
# Initialize the state transition table that will be used by the
# finite state machine to convert incoming characters to tokens.
#
# _st_::
# a hash that defines the state transition table to be used
# (see below).
# _start_state_::
# defines the starting state for the finite state machine.
# _catch_all_charset_id_::
# defines an optional charset id to be tried if the character
# currently being analysed matches none of the charsets
# in the charset table. The default +nil+ ensures that the
# InvalidLexeme exception is raised if no charsets match.
#
# The state transition table hash _st_ maps each valid original
# state to a hash containing the _rules_ to match when in that
# state.
#
# Each hash entry _rule_ maps one of the character set ids
# (defined in the call to #init_char_sets) to the _actions_ to be
# carried out if the current character being analysed by the lexer
# matches.
#
# The _action_ is a hash of distinct actions to be carried out for
# a match. The following actions are supported:
#
# <tt>:next_s => _state_</tt>::
# sets the finite state machine next state to be _state_ and
# appends the current character to the lexeme string being
# prepared, absorbing the current character in the datastream.
#
# <tt>:next_s_skip => _state_</tt>::
# as above but the lexeme string being prepared remains static.
#
# <tt>:next_s_backtrack => _state_</tt>::
# as for _next_s_skip_ above but does not absorb the current
# character (it will be used for the next state test).
#
# <tt>:token => _tok_</tt>::
# appends a hash containing a single entry to the array of
# generated tokens, using _tok_ as the key and a copy of the
# prepared lexeme string as the value.
#
# When the end of the datastream is reached, the lexer looks for
# a match against the end-of-file charset (default <tt>:eof</tt>).
#
# When the performed actions contain no +next_s+... action, the
# lexer assumes that a final state has been reached and returns
# the accumulated array of tokens up to that point.
#
# e.g.
#
# init_state_transitions({
# :s1 => {:alpha => {:next_s => :s2},
# :period => {:token => :tok_period}},
# :s2 => {:alphanum => {:next_s => :s2},
# :underscore => {:next_s_skip => :s2},
# :period => {:next_s_backtrack => :s1},
# :eof => {}}  # final state, return tokens
# }, :s1, :other_chars)
#
# Note, the contents of the hash are duplicated and stored
# internally to avoid any inadvertent corruption from outside.
def init_state_transitions(st, start_state = :s_start,
catch_all_charset_id = nil)
@start_state = start_state
@others_key = catch_all_charset_id
@s_trans = {}
# Make a verbatim copy of the state transition table
st.each_pair do |orig_state, lexer_rules|
@s_trans[orig_state] = state_rules = {}
lexer_rules.each_pair do |lexer_charset, lexer_actions|
state_rules[lexer_charset] = cur_actions = {}
lexer_actions.each_pair do |action, new_val|
cur_actions[action] = new_val
end
end
end
puts "@s_trans =\n#{@s_trans.inspect}\n\n" if $DEBUG == 1
end
# Tokenize the datastream in _str_ according to the specific
# character set and state transition table initialized through
# #init_char_sets and #init_state_transitions.
#
# Returns an array of token elements where each element is
# a pair of the form:
#
# [:token_name, "extracted lexeme string"]
#
# The end token marker [:tok_end, nil] is appended to the end
# of the result on success, e.g.
#
# tokenize(str)
# # => [[:tok_a, "123"], [:tok_b, "abc"], [:tok_end, nil]]
#
# Raises the LexerFailure exception if no matching state
# transition is found for the current state and character.
def tokenize(str)
state = @start_state
lexeme = ''
tokens = []
# Append our end of file marker to the string to be tokenized
str += "%c" % @lex_eof
str.each_byte do |char|
char_as_str = "%c" % char
loop do
match = @charsets.find do |id, set|
(set.kind_of? Regexp) ? (set =~ char_as_str) : (set.include? char)
end
# Fall back on the catch-all charset id, if one was supplied.
match ||= [@others_key, @charsets[@others_key]] if @others_key
raise InvalidLexeme unless match
# Look for the action matching our current state and the
# character set id for our current char.
action = @s_trans[state][match.first] or raise LexerFailure
# If found, action contains our hash of actions, e.g.
# {:next_s_backtrack => :s_operator, :token => :tok_int}
puts "#{char==@lex_eof?'<eof>':char_as_str}: " \
"#{state.inspect} - #{action.inspect}" if $DEBUG == 1
# Build up the lexeme unless we're backtracking or skipping
lexeme << char_as_str if action.has_key? :next_s
if action.has_key? :token
tokens << [action[:token], lexeme.dup]
lexeme = ''
end
# Set the next state, or - when there is no specified next
# state - we've finished, so return the tokens.
state = action[:next_s] || action[:next_s_skip] ||
action[:next_s_backtrack] or
return tokens << [:tok_end, nil]
break unless action.has_key? :next_s_backtrack
end
end
tokens
end
end
if $0 == __FILE__
eval DATA.read, nil, $0, __LINE__+4
end
__END__
require 'test/unit'
class TC_LexerMW < Test::Unit::TestCase
def test_simple
@lexer = LexerMW.new()
@char_sets = {
:letter => (?a..?z),
:digit => (/\d/),
:space => [?\s, ?_]
}
@lexer.init_char_sets(@char_sets)
@st = {
:extract_chars => {
:letter => {:next_s => :extract_chars},
:digit => {:next_s => :extract_chars},
:space => {:next_s_skip => :extract_chars,
:token => :tok_text},
:eof => {:token => :tok_text}
},
:extract_alpha => {
:letter => {:next_s => :extract_alpha},
:digit => {:next_s_backtrack => :extract_num,
:token => :tok_alpha},
:space => {:next_s_skip => :extract_alpha,
:token => :tok_alpha},
:others => {:next_s_skip => :extract_alpha},
:eof_exit => {}
},
:extract_num => {
:letter => {:next_s_backtrack => :extract_alpha,
:token => :tok_num},
:digit => {:next_s => :extract_num},
:space => {:next_s_skip => :extract_num},
:others => {:next_s_skip => :extract_alpha,
:token => :tok_num}
}
}
@lexer.init_state_transitions(@st, :extract_chars)
assert_equal [
[:tok_text, "123"], [:tok_text, "45"],
[:tok_text, "6"], [:tok_text, "78"],
[:tok_text, "abcd"], [:tok_text, "efghi"],
[:tok_text, "jklmn"], [:tok_end, nil]
], @lexer.tokenize("123 45 6_78 abcd efghi_jklmn")
@lexer = LexerMW.new(?$)
@lexer.init_char_sets(@char_sets, :eof_exit)
@lexer.init_state_transitions(@st, :extract_num, :others)
assert_equal [
[:tok_num, "12345678"], [:tok_alpha, "abcd"],
[:tok_alpha, "efghi"], [:tok_num, "445"],
[:tok_alpha, ""], [:tok_num, "1222"], [:tok_end, nil]
], @lexer.tokenize("123 45 6_78 abcd efghi445!12_22!ab$45")
end
end