Hello all,
Here is an attempt at a naive fuzzy string matching algorithm. I would
appreciate comments on the code - which is not as simple as I would
like.
I have a couple of speed improvements in mind:
1) replace EQUAL hash with array (possibly NArray).
2) a path can be terminated before fully explored if it can be
calculated that it will never complete (somewhere inside in_bounds?).
Cheers,
Martin
#!/usr/bin/env ruby
require 'pp'
VERBOSE = false
TEST = true
# IUPAC nucleotide pair ambiguity equivalents
EQUAL = {
:AA => true, :BU => true, :TH => true, :UY => true,
:TT => true, :CB => true, :UH => true, :SC => true,
:CC => true, :GB => true, :VA => true, :SG => true,
:GG => true, :TB => true, :VC => true, :CS => true,
:UU => true, :UB => true, :VG => true, :GS => true,
:NA => true, :DA => true, :AV => true, :WA => true,
:NT => true, :DG => true, :CV => true, :WT => true,
:NC => true, :DT => true, :GV => true, :WU => true,
:NG => true, :DU => true, :KG => true, :AW => true,
:NU => true, :AD => true, :KT => true, :TW => true,
:AN => true, :GD => true, :KU => true, :UW => true,
:TN => true, :TD => true, :GK => true, :RA => true,
:CN => true, :UD => true, :TK => true, :RG => true,
:GN => true, :HA => true, :UK => true, :AR => true,
:UN => true, :HC => true, :YC => true, :GR => true,
:NN => true, :HT => true, :YT => true, :MA => true,
:BC => true, :HU => true, :YU => true, :MC => true,
:BG => true, :AH => true, :CY => true, :AM => true,
:BT => true, :CH => true, :TY => true, :CM => true,
}
# Class to match two nucleotide sequences allowing for ambiguity codes
# and a given maximum number of matches, insertions, and deletions.
class Matcher
attr_accessor :seq1_index, :seq2_index, :matches, :mismatches,
:insertions, :deletions
def initialize(seq1, seq2, max_mismatches = 0, max_insertions = 0,
max_deletions = 0, seq1_index = 0, seq2_index = 0, matches = 0,
mismatches = 0, insertions = 0, deletions = 0)
@seq1 = seq1
@seq2 = seq2
@max_mismatches = max_mismatches
@max_insertions = max_insertions
@max_deletions = max_deletions
@seq1_index = seq1_index
@seq2_index = seq2_index
@matches = matches
@mismatches = mismatches
@insertions = insertions
@deletions = deletions
end
# Method to explore all paths for matching two sequences allowing for
# a given maximum number of mismatches, insertions and deletions.
def match
paths = []
paths << self
while not paths.empty?
pp paths if VERBOSE
new_paths = []
paths.each do |path|
if path.match? # ---- Match ----
path_matches = path.dup
path_matches.matches += 1
return path_matches if path_matches.ok?
path_matches.seq1_index += 1
path_matches.seq2_index += 1
new_paths << path_matches if path_matches.in_bounds?
else # ---- Mismatch
···
----
path_mismatches = path.dup
path_mismatches.mismatches += 1
return path_mismatches if path_mismatches.ok?
path_mismatches.seq1_index += 1
path_mismatches.seq2_index += 1
new_paths << path_mismatches if path_mismatches.in_bounds?
end
if path.insertions < @max_insertions # ---- Insertion
----
path_insertions = path.dup
path_insertions.insertions += 1
return path_insertions if path_insertions.ok?
path_insertions.seq1_index += 1
new_paths << path_insertions if path_insertions.in_bounds?
end
if path.deletions < @max_deletions # ---- Deletion
----
path_deletions = path.dup
path_deletions.deletions += 1
return path_deletions if path_deletions.ok?
path_deletions.seq2_index += 1
new_paths << path_deletions if path_deletions.in_bounds?
end
end
paths = new_paths
end
end
# Method to check if residues match.
def match?
if self.seq1_index < @seq1.length and self.seq2_index < @seq2.length
EQUAL[(@seq1[self.seq1_index].upcase +
@seq2[self.seq2_index].upcase).to_sym]
end
end
# Method to check if a path is complete.
def ok?
if self.mismatches <= @max_mismatches and self.insertions <=
@max_insertions and self.deletions <= @max_deletions
if @seq1.length == self.matches + self.mismatches +
self.insertions and
@seq2.length == self.matches + self.mismatches + self.deletions
pp self if VERBOSE
true
end
end
end
# Method to check if the path is within the search space.
def in_bounds?
if self.seq1_index <= @seq1.length and self.seq2_index <=
@seq2.length
true
else
false
end
end
end
if VERBOSE
m = Matcher.new("atcg", "atcgX", mismatches=0, insertions=0,
deletions=1)
m.match
end
if TEST and __FILE__ == $PROGRAM_NAME
require "test/unit"
class TestMatcher < Test::Unit::TestCase
def test_perfect_match_returns_ok
m = Matcher.new("atcg", "atcg", mismatches=0, insertions=0,
deletions=0)
assert_not_nil(m.match)
end
def test_perfect_match_with_ambiguity_returns_ok
m = Matcher.new("atcg", "NNNN", mismatches=0, insertions=0,
deletions=0)
assert_not_nil(m.match)
end
def test_one_mismatch_with_zero_allowed_returns_nil
m = Matcher.new("atcg", "atGg", mismatches=0, insertions=0,
deletions=0)
assert_equal(nil, m.match)
end
def test_one_mismatch_with_one_allowed_returns_ok
m = Matcher.new("atcg", "atGg", mismatches=1, insertions=0,
deletions=0)
assert_not_nil(m.match)
end
def test_two_mismatch_with_one_allowed_returns_nil
m = Matcher.new("atcg", "GtGg", mismatches=1, insertions=0,
deletions=0)
assert_equal(nil, m.match)
end
def test_two_mismatch_with_two_allowed_returns_ok
m = Matcher.new("atcg", "GtGg", mismatches=2, insertions=0,
deletions=0)
assert_not_nil(m.match)
end
def test_one_insertion_with_zero_allowed_returns_nil
m = Matcher.new("atGcg", "atcg", mismatches=0, insertions=0,
deletions=0)
assert_equal(nil, m.match)
end
def test_one_insertion_with_one_allowed_returns_ok
m = Matcher.new("atGcg", "atcg", mismatches=0, insertions=1,
deletions=0)
assert_not_nil(m.match)
end
def test_two_insertion_with_one_allowed_returns_nil
m = Matcher.new("atGcgC", "atcg", mismatches=0, insertions=1,
deletions=0)
assert_equal(nil, m.match)
end
def test_two_insertion_with_two_allowed_returns_ok
m = Matcher.new("atGcgC", "atcg", mismatches=0, insertions=2,
deletions=0)
assert_not_nil(m.match)
end
def test_one_deletion_with_zero_allowed_returns_nil
m = Matcher.new("atcg", "atcAg", mismatches=0, insertions=0,
deletions=0)
assert_equal(nil, m.match)
end
def test_one_deletion_with_one_allowed_returns_ok
m = Matcher.new("atcg", "atcAg", mismatches=0, insertions=0,
deletions=1)
assert_not_nil(m.match)
end
def test_two_deletion_with_one_allowed_returns_nil
m = Matcher.new("atcg", "TatcAg", mismatches=0, insertions=0,
deletions=1)
assert_equal(nil, m.match)
end
def test_two_deletion_with_two_allowed_returns_ok
m = Matcher.new("atcg", "AatcTg", mismatches=0, insertions=0,
deletions=2)
assert_not_nil(m.match)
end
def test_one_mismatch_one_insertions_one_deletion_returns_ok
m = Matcher.new("TaGcg", "atcgA", mismatches=1, insertions=1,
deletions=1)
assert_not_nil(m.match)
end
end
end
--
Posted via http://www.ruby-forum.com/.