Florian Frank wrote:
Robert Klemme wrote:
"Florian Frank" <flori@nixe.ping.de> schrieb im Newsbeitrag
news:42B8D1A3.4040005@nixe.ping.de...
Charles L. Snyder wrote:
I have a text-parsing question -
array1 = ["Joe lives in France", "Sue goes to Japan", "Hiro walked
around
France", "Bill moved to England"]
a_countries = ["France", "Japan", "England", "Spain"]
puts a_countries.inject({}) {|h,c| h[c] = array1.inject(0) { |s,a|
s + a.scan(c).size }; h }.sort_by { |_,v| -v }.map { |a|
"%10s%10u" % a }
If I'm not mistaken your variant iterates array1 a_countries.size
number of times. For large arrays this might be quite inefficient.
This is what I'd do:
counts = a_countries.inject({}){|h,c| h[c]=0; h}
array1.each do |text|
text.scan(/\w+/i) {|m| c = counts[m] and counts[m] = c+1}
end
But, but,... you used more than one line!? Isn't this against the
rules?
Well... I can squeeze it on one line.
You could even optimize away one hash lookups per found word by
doing:
class Counter
def initialize(n=0) @n=n end
def inc() @n+=1 end
def to_int() @n end
end
Without benchmarking (yeah) it's difficult to say, if this is really
faster.
counts = a_countries.inject({}){|h,c| h[c]=Counter.new; h}
array1.each do |text|
text.scan(/\w+/i) {|m| c = counts[m] and c.inc}
end
But I am sure calling the block for every WORD of a text and having a
hash lookup for each word, is quite inefficient for larger texts.
What about:
array1.inject(Hash.new(0)){|c,t|$re||=/\b(#{a_countries.join('|')})\b/;t.s
can($re){|x|c+=1};c}
Note especially, the one line.
Yes, I've thought of that, too. In fact, you can create a more
optimal regexp. Here's a simplified variant of what I usually use
(the more verbose version optimizes away unnecessary regexp
groupings):
class Trie
attr_reader :root
def initialize()
create = lambda {|h,k| h[k]=Hash.new(&create)}
@root = Hash.new(&create)
end
def add(str)
node = root
str.each_byte do |char|
node = node[char]
end
self
end
def to_rx() Regexp.new(to_rx_str) end
def to_rx_str(node = root)
node.map {|k,v| "" << k << "(?:" << to_rx_str(v) << ")" }.join "|"
end
end
array1 = ["Joe lives in France", "Sue goes to Japan", "Hiro walked
around France", "Bill moved to England"]
=> ["Joe lives in France", "Sue goes to Japan", "Hiro walked around
France", "Bill moved to England"]
a_countries = ["France", "Japan", "England", "Spain"]
=> ["France", "Japan", "England", "Spain"]
rx = a_countries.inject(Trie.new){|tr,c|tr.add c}.to_rx
=>
/S(?:p(?:a(?:i(?:n(?:)))))|J(?:a(?:p(?:a(?:n(?:)))))|E(?:n(?:g(?:l(?:a(?:n
(?:d(?:)))))))|F(?:r(?:a(?:n(?:c(?:e(?:))))))/
counts = array1.inject(Hash.new(0)){|h,text| text.scan(rx){|m|
h[m]+=1}; h}
=> {"France"=>2, "Japan"=>1, "England"=>1}
Kind regards
robert
Here's a more optimized version
class Trie
attr_reader :root
def initialize()
create = lambda {|h,k| h[k]=Hash.new(&create)}
@root = Hash.new(&create)
end
def add(str)
node = root
str.each_byte do |char|
node = node[char]
end
self
end
def to_rx(*rx_options) Regexp.new(to_rx_str, *rx_options) end
def to_rx_str(node = root)
case node.size
when 0
""
when 1
"" << node.keys[0] << to_rx_str(node.values[0])
else
sub_rx = node.map {|k,v| "" << k << to_rx_str(v) }.join( "|" )
if node.equal? root then sub_rx else "(?:" << sub_rx << ")" end
end
end
end
?> array1 = ["Joe lives in France", "Sue goes to Japan", "Hiro walked around France", "Bill moved to England"]
=> ["Joe lives in France", "Sue goes to Japan", "Hiro walked around France", "Bill moved to England"]
a_countries = ["France", "Japan", "England", "Spain", "Ecuador"]
=> ["France", "Japan", "England", "Spain", "Ecuador"]
rx = a_countries.inject(Trie.new){|tr,c|tr.add c}.to_rx
=> /Spain|Japan|E(?:cuador|ngland)|France/
Cheers
robert
···
Robert Klemme <bob.news@gmx.net> wrote: