Python-Ref > Cheminformatics > Molecular fingerprints > How to use fingerprints in screening
 
 

<-^^->

How to use fingerprints in screening

The following example builds on our own implementation of fingerprints from the previous slide. It shows that even a very simple and naive implementation of fingerprints can save a lot of work.
Expand/Shrink
  1   from oasa import smiles
  2   from oasa import periodic_table as pt
  3   import operator
  4   import time
  5   
  6   def atom_to_num( a):
  7     return pt.periodic_table[a.symbol]['ord']
  8   
  9   def get_fp( mol):
 10     frags = []
 11     for atom1 in mol.atoms:
 12       for edge2, atom2 in atom1.get_neighbor_edge_pairs():
 13         frags.append( (atom_to_num(atom1),edge2.order,atom_to_num(atom2)))
 14     fps2 = set( [sum( map( operator.mul, frag, [3,5,3])) % 256 for frag in frags])
 15     frags = []
 16     for atom1 in mol.atoms:
 17       for edge2, atom2 in atom1.get_neighbor_edge_pairs():
 18         for edge3, atom3 in atom2.get_neighbor_edge_pairs():
 19           if atom1 is not atom3:
 20             frags.append( (atom_to_num(atom1),edge2.order,atom_to_num(atom2),edge3.order,atom_to_num(atom3)))
 21     fps3 = set( [sum( map( operator.mul, frag, [3,5,7,5,3])) % 256 for frag in frags])
 22     return fps2 | fps3
 23   
 24   f = file( 'structures.200.txt', 'r')
 25   mols = []
 26   for line in f:
 27     sm = line.split("\t")[2]
 28     mol = smiles.text_to_mol( sm)
 29     fp = get_fp( mol)
 30     mols.append( (mol, fp))
 31   f.close()
 32   
 33   print len( mols)
 34   
 35   mol = smiles.text_to_mol( "O=C(O)C=C")
 36   fp2 = get_fp( mol)
 37   
 38   # brute force method
 39   res2 = []
 40   t = time.time()
 41   for amol, afp in mols:
 42     res2.append( amol.contains_substructure( mol))
 43   print "Brute force: %.1fms" % (1000*(time.time()-t))
 44   
 45   # screening
 46   res1 = []
 47   screen_excludes = 0
 48   t = time.time()
 49   for amol, afp in mols:
 50     if afp & fp2 == fp2:
 51       res1.append( amol.contains_substructure( mol))
 52     else:
 53       res1.append( False)
 54       screen_excludes += 1
 55   print "With fingerprint screening: %.1fms" % (1000*(time.time()-t))
 56   print "Screening excluded %d (%.1f%%) structures" % (screen_excludes,100.0*screen_excludes/len(res1))
 57   print "Are results equal:", res1 == res2
 58   print "Total hits:", len( filter( None, res1))
 59   
 60     
 61       
stdout:
200
Brute force: 55.1ms
With fingerprint screening: 6.6ms
Screening excluded 183 (91.5%) structures
Are results equal: True
Total hits: 12
Doba běhu: 816.7 ms
The examples below use OpenBabel and are much closer to a real-world scenario.
Expand/Shrink
  1   import pybel
  2   
  3   def fp_to_ascii( fp):
  4     return "".join( ["%08x"%num for num in fp])
  5       
  6   def ascii_to_fp( ascii):
  7     ret = []
  8     for i in range( 0, 256, 8):
  9       ret.append( int( ascii[i:i+8], 16))
 10     return ret
 11   
 12   # we prepare a database-like file for later use
 13   f = file( 'structures.200.txt', 'r')
 14   db_file = file( "fingerprints.db", "w")
 15   for line in f:
 16     sm = line.split("\t")[2]
 17     mol = pybel.readstring( "smi", sm)
 18     fp = mol.calcfp()
 19     db_file.write( "%s %s\n" % (sm, fp_to_ascii( fp.fp)))
 20   f.close()
 21   db_file.close()
 22   
 23   # now we use the file for search
 24   search_smiles = "C=CC(=O)O"
 25   search_mol = pybel.readstring( "smi", search_smiles)
 26   search_fp = list( search_mol.calcfp().fp)
 27   db_file = file( "fingerprints.db", "r")
 28   for line in db_file:
 29     smiles, finger = line.split()
 30     fp = ascii_to_fp( finger)
 31     match = True
 32     # we compare the fingerprints by individual 32-bit chunks
 33     for i,num in enumerate( fp):
 34       search_num = search_fp[i]
 35       if num & search_num != search_num:
 36         match = False
 37         break
 38     if match:
 39       print "HIT", smiles
 40       print bool( pybel.Smarts( search_smiles).findall( pybel.readstring("smi", smiles)))
 41   db_file.close()
 42       
stdout:
HIT C(=CC(=O)O)C=CC(=O)O
True
HIT C(C(=O)C=CC(=O)O)C(=O)O
True
HIT C(=CC(=O)O)C(=O)O
True
Doba běhu: 292.7 ms
Expand/Shrink
  1   import pybel
  2   import time
  3   
  4   def fp_to_ascii( fp):
  5     return "".join( ["%08x"%num for num in fp])
  6       
  7   def ascii_to_fp( ascii):
  8     ret = []
  9     for i in range( 0, 256, 8):
 10       ret.append( int( ascii[i:i+8], 16))
 11     return ret
 12   
 13   # we prepare a database-like file for later use
 14   f = file( 'structures.200.txt', 'r')
 15   db_file = file( "fingerprints.db", "w")
 16   for line in f:
 17     sm = line.split("\t")[2]
 18     mol = pybel.readstring( "smi", sm)
 19     fp = mol.calcfp()
 20     db_file.write( "%s %s\n" % (sm, fp_to_ascii( fp.fp)))
 21   f.close()
 22   db_file.close()
 23   
 24   # search using fingerprints
 25   t = time.time()
 26   search_smiles = "C=CC(=O)O"
 27   search_smarts = pybel.Smarts( search_smiles)
 28   search_mol = pybel.readstring( "smi", search_smiles)
 29   search_fp = list( search_mol.calcfp().fp)
 30   db_file = file( "fingerprints.db", "r")
 31   screen_hits = 0
 32   for line in db_file:
 33     smiles, finger = line.split()
 34     fp = ascii_to_fp( finger)
 35     match = True
 36     # we compare the fingerprints by individual 32-bit chunks
 37     for i,num in enumerate( fp):
 38       search_num = search_fp[i]
 39       if num & search_num != search_num:
 40         match = False
 41         break
 42     if match:
 43       if bool( search_smarts.findall( pybel.readstring("smi", smiles))):
 44         print "  HIT", smiles
 45         screen_hits += 1
 46   print "With fingerprint screening: %.1fms" % (1000*(time.time()-t))
 47   print "%d screening hits excluded %d (%.1f%%) molecules" % (screen_hits,
 48                                                               200-screen_hits,
 49                                                               100.0*(200-screen_hits)/200
 50                                                               )
 51   db_file.close()
 52   
 53   # brute force search
 54   t = time.time()
 55   search_smiles = "C=CC(=O)O"
 56   search_smarts = pybel.Smarts( search_smiles)
 57   db_file = file( "fingerprints.db", "r")
 58   for line in db_file:
 59     smiles, finger = line.split()
 60     if bool( search_smarts.findall( pybel.readstring("smi", smiles))):
 61       print "  HIT", smiles
 62   print "Brute force: %.1fms" % (1000*(time.time()-t))
 63   db_file.close()
 64   
 65       
stdout:
  HIT C(=CC(=O)O)C=CC(=O)O
  HIT C(C(=O)C=CC(=O)O)C(=O)O
  HIT C(=CC(=O)O)C(=O)O
With fingerprint screening: 7.7ms
3 screening hits excluded 197 (98.5%) molecules
  HIT C(=CC(=O)O)C=CC(=O)O
  HIT C(C(=O)C=CC(=O)O)C(=O)O
  HIT C(=CC(=O)O)C(=O)O
Brute force: 96.1ms
Doba běhu: 382.4 ms