#! /usr/bin/python # Fast search using a Boyer-Moore inspired algorithm # This program is under GPL license. # Philippe Biondi test1={"t1":"abc"} test2={"t1":"abcefghj", "t4":"jjjjj", "t5":"222jjjj", "t2":"bczerzerzed", "t3":"zeeee"} MAXNB=99999999 def make_machine(dict): mac={"":[MAXNB,[]]} for k,v in dict.items(): if mac[""][0] > len(v): mac[""][0]=len(v) for i in range(0,len(v)): if (not mac.has_key(v[i])): mac[v[i]]=[MAXNB,[]] if (len(v)-i-1 > 0): if (mac[v[i]][0] >= len(v)-i): mac[v[i]][0]=len(v)-i-1 else: mac[v[i]][1].append((k,v)) for k in mac.keys(): if mac[k][0] > mac[""][0]: mac[k][0]=mac[""][0] return mac def search(mac,str): st="" p=mac[st][0] count=0 comp=0 future_comp=0 while (p < len(str)): count=count+1 if mac.has_key(str[p]): st=str[p] j=0 for name,pattern in mac[st][1]: i=0 while str[p-i-1]==pattern[len(pattern)-i-2]: i=i+1 comp=comp+i if i>j: j=i if pattern == str[p-len(pattern)+1:p+1]: print "Found %s [%s] at pos %i" % (name,pattern,p-len(pattern)+2) future_comp=future_comp+j else: st="" p=p+mac[st][0] print "lenstr=%i count=%i cmp=%i futurcomp=%i totalcmp=%i(%.1f%%) futurtot=%i(%.1f%%)" % \ (len(str),count,comp,future_comp, count+comp,100.0*(comp+count)/len(str), future_comp+count,100.0*(future_comp+count)/len(str)) mac=make_machine(test2) print mac search(mac,"1234567890123456789022222jjjjjabcefghjqsdqsbczerlkdsghqsldh lqskh lqskd hlskqfh lsqkfhkfezrhofierhfoiezhnvoiuhfovisudhfcoizeq,cfoisqdhfcoiuqshf oisqdfhnoiqushf oiqsuhfoiqzuhsfconiuqzhefnociuqzhfno iqsuhfo izquhrf oiqsuhfoiqsuhdvrofiuqzhecoi fuhqz oifuhqzeo ifuhqzoeifuhqzoeiucfhoaizuhfcoiqzuhfociqsuhfoiqushfoiquzhsfviqzhefiuvqzhefjvjfjjfjjjijijjjjjjjjezvqfhoiqsvdufhqisdfhivuezhviaqufhizqhefviqzuhfvizhefviqfezvjhqlesfjkvhqlksfjh zeeeeeh gwdsfg hlsdfjh sldfghldkgjhvsldkjghkezjhezkjhzelkjezhlzkejzehzekjhzelkezjhezlvkjhwdfkhfdlghdslfjhqlkshqpoifhzfhwksdjfhlk jqabcabdhsakldhgbaca shdkazdhacbabcjdsqfhklqsjdlabclmkjfmlqfjababclksdfmlsdhfeiguhsoizerbczerzerzedzeddazdazdbcd")