Python-Ref > Cheminformatics > Molecular fingerprints > How to use fingerprints in similarity searches
 
 

<-^^

How to use fingerprints in similarity searches

Expand/Shrink
  1   import pybel
  2   
  3   f = file( 'structures.200.txt', 'r')
  4   mol1 = pybel.readstring( "smi", f.readline().strip().split("\t")[2])
  5   fp1 = mol1.calcfp()
  6   f.seek( 0)
  7   for i, line in enumerate( f):
  8     sm = line.split("\t")[2]
  9     mol = pybel.readstring( "smi", sm)
 10     fp = mol.calcfp()
 11     tanimoto = fp | fp1 # the | operator is overloaded to compute Tanimoto coefficient
 12     print "%.2f  %03d  %s" % (tanimoto, i, sm)
 13     if i > 20:
 14       break
 15   f.close()
stdout:
1.00  000  CCN1C=NC2=C1N=CN=C2N
0.02  001  C(CCl)Cl
0.04  002  C1=C(C=C(C(=C1O)O)O)O
0.03  003  C1=CC(=C(C=C1Cl)Cl)Cl
0.15  004  C1=CC(=C(C(=C1)O)O)C(=O)O
0.02  005  C(C=O)Cl
0.05  006  C(CCl)O
0.10  007  C(CC(=O)O)C(C(=O)O)O
0.10  008  C(C(=O)[O-])(C(=O)[O-])O
0.10  009  C(C(=O)O)(C(=O)O)O
0.10  010  C(CC(=O)O)C(=O)C(=O)O
0.10  011  CCC(=O)C(=O)O
0.16  012  CC(CC(C(=O)O)N)N
0.04  013  C1=CC(=C(C=C1Cl)O)Cl
0.12  014  C(CC(=O)C(=O)O)CC(=O)O
0.13  015  C1=CC(=C(C=C1C(=O)O)O)O
0.08  016  C(CN)C=O
0.11  017  C1=CC(=CN=C1)C#N
0.13  018  C(CC(=O)O)C(=O)CC(=O)O
0.11  019  CC(=O)CC(=O)O
0.09  020  C1=CC(=CC(=C1)O)C=O
0.11  021  C1=CC(=CC(=C1)O)CO
Run time: 159.3 ms
Expand/Shrink
  1   import pybel
  2   
  3   f = file( 'structures.200.txt', 'r')
  4   mol1 = pybel.readstring( "smi", f.readline().strip().split("\t")[2])
  5   fp1 = mol1.calcfp()
  6   f.seek( 0)
  7   for i, line in enumerate( f):
  8     sm = line.split("\t")[2]
  9     mol = pybel.readstring( "smi", sm)
 10     fp = mol.calcfp()
 11     tanimoto = fp | fp1 # the | operator is overloaded to compute Tanimoto coefficient
 12     if tanimoto > 0.4:
 13       print "%.2f  %03d  %s" % (tanimoto, i, sm)
 14       mol.draw( show=False, filename="fingermol-%d.png"%i)
 15   f.close()
stdout:
1.00  000  CCN1C=NC2=C1N=CN=C2N
0.76  041  C1=NC2=C(N1)C(=NC=N2)N
0.41  140  C1=C(NC(=O)N=C1)N
0.44  174  C1=NC2=C(N1)C(=O)N=CN2
output image fingermol-140.png
output image fingermol-41.png
output image fingermol-174.png
output image fingermol-0.png
Run time: 382.2 ms
The following example uses a simple database-like file, of the kind we have seen earlier, to demonstrate how the Tanimoto coefficient is calculated.
Expand/Shrink
  1   import pybel
  2   
  3   def fp_to_ascii( fp):
  4     return "".join( ["%08x"%num for num in fp])
  5       
  6   def ascii_to_fp( ascii):
  7     ret = []
  8     for i in range( 0, 256, 8):
  9       ret.append( int( ascii[i:i+8], 16))
 10     return ret
 11   
 12   def count_bits( num):
 13     count = 0
 14     while num:
 15       count += num & 1
 16       num >>= 1
 17     return count
 18   
 19   
 20   # we prepare a database-like file for later use
 21   f = file( 'structures.200.txt', 'r')
 22   db_file = file( "fingerprints.db", "w")
 23   for line in f:
 24     sm = line.split("\t")[2]
 25     mol = pybel.readstring( "smi", sm)
 26     fp = mol.calcfp()
 27     db_file.write( "%s %s\n" % (sm, fp_to_ascii( fp.fp)))
 28   f.close()
 29   db_file.close()
 30   
 31   # now we do some similarity searching
 32   db_file = file( "fingerprints.db", "r")
 33   smiles1, finger1 = db_file.readline().split()
 34   fp1 = ascii_to_fp( finger1)
 35   db_file.seek( 0)
 36   for line_num, line in enumerate( db_file):
 37     smiles, finger = line.split()
 38     fp = ascii_to_fp( finger)
 39     # now we calcualte tanimoto
 40     common, all = 0, 0
 41     for i, num in enumerate( fp):
 42       num1 = fp1[i]
 43       common += count_bits( num1 & num)
 44       all += count_bits( num1 | num)
 45     tanimoto = 1.0*common/all
 46     print "%.2f  %03d  %s" % (tanimoto, line_num, smiles)
 47     if line_num > 20:
 48       break
 49   db_file.close()
stdout:
1.00  000  CCN1C=NC2=C1N=CN=C2N
0.02  001  C(CCl)Cl
0.04  002  C1=C(C=C(C(=C1O)O)O)O
0.03  003  C1=CC(=C(C=C1Cl)Cl)Cl
0.15  004  C1=CC(=C(C(=C1)O)O)C(=O)O
0.02  005  C(C=O)Cl
0.05  006  C(CCl)O
0.10  007  C(CC(=O)O)C(C(=O)O)O
0.10  008  C(C(=O)[O-])(C(=O)[O-])O
0.10  009  C(C(=O)O)(C(=O)O)O
0.10  010  C(CC(=O)O)C(=O)C(=O)O
0.10  011  CCC(=O)C(=O)O
0.16  012  CC(CC(C(=O)O)N)N
0.04  013  C1=CC(=C(C=C1Cl)O)Cl
0.12  014  C(CC(=O)C(=O)O)CC(=O)O
0.13  015  C1=CC(=C(C=C1C(=O)O)O)O
0.08  016  C(CN)C=O
0.11  017  C1=CC(=CN=C1)C#N
0.13  018  C(CC(=O)O)C(=O)CC(=O)O
0.11  019  CC(=O)CC(=O)O
0.09  020  C1=CC(=CC(=C1)O)C=O
0.11  021  C1=CC(=CC(=C1)O)CO
Run time: 282.2 ms