Python-Ref > Cheminformatics > Pubchem > Power User Gateway (PUG) to PubChem
 
 

^^

Power User Gateway (PUG) to PubChem

It is possible to ask PubChem about structures in a programmatic way by using the Power User Gateway (PUG). This interface is based on a simple HTTP communication using XML files.
Because the server needs some time to process a request, the communication is not a single request-and-answer exchange: while the job is pending, the server replies with a "still working" status. The client periodically polls, using the unique request ID it was given, until its request has been processed.
The following code builds a simple example of communication.
The first example just shows how to submit a request and receive its ID for later use.
Expand/Shrink
Zdroj: (pubchem1-1.py)
  1   query_temp = '''<?xml version="1.0"?>
  2   <!DOCTYPE PCT-Data PUBLIC "-//NCBI//NCBI PCTools/EN" "http://pubchem.ncbi.nlm.nih.gov/pug/pug.dtd">
  3   <PCT-Data>
  4     <PCT-Data_input>
  5       <PCT-InputData>
  6         <PCT-InputData_query>
  7           <PCT-Query>
  8             <PCT-Query_type>
  9               <PCT-QueryType>
 10                 <PCT-QueryType_css>
 11                   <PCT-QueryCompoundCS>
 12                     <PCT-QueryCompoundCS_query>
 13                       <PCT-QueryCompoundCS_query_data>%(query)s</PCT-QueryCompoundCS_query_data>
 14                     </PCT-QueryCompoundCS_query>
 15                     <PCT-QueryCompoundCS_type>
 16                       <PCT-QueryCompoundCS_type_identical>
 17                         <PCT-CSIdentity value="same-isotope">4</PCT-CSIdentity>
 18                       </PCT-QueryCompoundCS_type_identical>
 19                     </PCT-QueryCompoundCS_type>
 20                     <PCT-QueryCompoundCS_results>100</PCT-QueryCompoundCS_results>
 21                   </PCT-QueryCompoundCS>
 22                 </PCT-QueryType_css>
 23               </PCT-QueryType>
 24             </PCT-Query_type>
 25           </PCT-Query>
 26         </PCT-InputData_query>
 27       </PCT-InputData>
 28     </PCT-Data_input>
 29   </PCT-Data>
 30   '''  # PUG structure-query request template; %(query)s is filled with the structure string (presumably SMILES) before sending
 31   
 32   import urllib2
 33   site = "http://pubchem.ncbi.nlm.nih.gov/pug/pug.cgi"
 34   
 35   import time
 36   import re
 37   
 38   query = query_temp % {"query": "c1ccccc1C"}  # fill the structure string into the request template
 39   f = urllib2.urlopen( site, query)  # supplying a data argument makes urllib2 send the request as a POST
 40   out = f.read()
 41   f.close()
 42   print out  # raw XML reply; in the run recorded below it carries the request id in PCT-Waiting_reqid
stdout:
<?xml version="1.0"?>
<!DOCTYPE PCT-Data PUBLIC "-//NCBI//NCBI PCTools/EN" "http://pubchem.ncbi.nlm.nih.gov/pug/pug.dtd">
<PCT-Data>
  <PCT-Data_output>
    <PCT-OutputData>
      <PCT-OutputData_status>
        <PCT-Status-Message>
          <PCT-Status-Message_status>
            <PCT-Status value="queued"/>
          </PCT-Status-Message_status>
        </PCT-Status-Message>
      </PCT-OutputData_status>
      <PCT-OutputData_output>
        <PCT-OutputData_output_waiting>
          <PCT-Waiting>
            <PCT-Waiting_reqid>571336780819832445</PCT-Waiting_reqid>
            <PCT-Waiting_message>Structure search job was submitted</PCT-Waiting_message>
          </PCT-Waiting>
        </PCT-OutputData_output_waiting>
      </PCT-OutputData_output>
    </PCT-OutputData>
  </PCT-Data_output>
</PCT-Data>

Doba běhu: 2500.1 ms
The second example is more elaborate and shows how to obtain CIDs (PubChem compound IDs) from a request.
Expand/Shrink
Zdroj: (pubchem1-2.py)
  1   query_temp = '''<?xml version="1.0"?>
  2   <!DOCTYPE PCT-Data PUBLIC "-//NCBI//NCBI PCTools/EN" "http://pubchem.ncbi.nlm.nih.gov/pug/pug.dtd">
  3   <PCT-Data>
  4     <PCT-Data_input>
  5       <PCT-InputData>
  6         <PCT-InputData_query>
  7           <PCT-Query>
  8             <PCT-Query_type>
  9               <PCT-QueryType>
 10                 <PCT-QueryType_css>
 11                   <PCT-QueryCompoundCS>
 12                     <PCT-QueryCompoundCS_query>
 13                       <PCT-QueryCompoundCS_query_data>%(query)s</PCT-QueryCompoundCS_query_data>
 14                     </PCT-QueryCompoundCS_query>
 15                     <PCT-QueryCompoundCS_type>
 16                       <PCT-QueryCompoundCS_type_identical>
 17                         <PCT-CSIdentity value="same-isotope">4</PCT-CSIdentity>
 18                       </PCT-QueryCompoundCS_type_identical>
 19                     </PCT-QueryCompoundCS_type>
 20                     <PCT-QueryCompoundCS_results>100</PCT-QueryCompoundCS_results>
 21                   </PCT-QueryCompoundCS>
 22                 </PCT-QueryType_css>
 23               </PCT-QueryType>
 24             </PCT-Query_type>
 25           </PCT-Query>
 26         </PCT-InputData_query>
 27       </PCT-InputData>
 28     </PCT-Data_input>
 29   </PCT-Data>
 30   '''  # PUG structure-query request template; %(query)s is filled with the structure string (presumably SMILES) before sending
 31   
 32   check_temp = '''<PCT-Data>
 33     <PCT-Data_input>
 34       <PCT-InputData>
 35         <PCT-InputData_request>
 36           <PCT-Request>
 37             <PCT-Request_reqid>%(reqid)s</PCT-Request_reqid>
 38             <PCT-Request_type value="status"/>
 39           </PCT-Request>
 40         </PCT-InputData_request>
 41       </PCT-InputData>
 42     </PCT-Data_input>
 43   </PCT-Data>'''  # status request template; %(reqid)s is the id of a previously submitted request
 44   
 45   cid_url = '''http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=xml&rettype=uilist&WebEnvRq=1&db=pccompound&query_key=%(query_key)s&WebEnv=%(webenv)s'''  # efetch URL template; expects query_key and webenv, and is used below to list the ids of the hits
 46   
 47   import urllib2
 48   site = "http://pubchem.ncbi.nlm.nih.gov/pug/pug.cgi"
 49   
 50   import time
 51   import re
 52   
 53   def pug_query( query):
 54       f = urllib2.urlopen( site, query)
 55       out = f.read()
 56       f.close()
 57       while '<PCT-Status value="queued"/>' in out or '<PCT-Status value="running"/>' in out:
 58           time.sleep( 0.5)
 59           print "waiting.."
 60           m = re.search( "<PCT-Waiting_reqid>(\d+)</PCT-Waiting_reqid>", out)
 61           if m:
 62               reqid = m.group( 1)
 63               out = check_query( reqid)
 64       return out
 65   
 66   def check_query( reqid):  # send a status request for `reqid` to PUG and return the raw reply text
 67       check = check_temp % locals()  # locals() supplies the `reqid` key that the template expects
 68       f = urllib2.urlopen( site, check)
 69       out = f.read()
 70       f.close()
 71       return out
 72   
 73   def get_query_key_and_webenv_from_result( text):
 74       m = re.search( "<PCT-Entrez_query-key>(\d+)</PCT-Entrez_query-key>", text)
 75       assert m
 76       query_key = m.group( 1)
 77       m = re.search( "<PCT-Entrez_webenv>(.+?)</PCT-Entrez_webenv>", text)
 78       assert m 
 79       webenv = m.group( 1)
 80       return query_key, webenv
 81   
 82   
 83   # submit the structure query and wait until PUG has finished processing it
 84   query_text = query_temp % {'query':"c1ccccc1C"}
 85   result = pug_query( query_text)
 86   # extract query_key and webenv from the result
 87   query_key, webenv = get_query_key_and_webenv_from_result( result)
 88   # ask for CIDs
 89   url = cid_url % locals()  # at module level locals() is the module namespace, so query_key and webenv come from the line above
 90   f = urllib2.urlopen( url)
 91   out = f.read()
 92   f.close()
 93   cids = re.findall( "<Id>(\d+)</Id>", out)  # collect the numeric content of every <Id> element as a string
 94   print cids
stdout:
waiting..
waiting..
waiting..
waiting..
waiting..
waiting..
waiting..
['1140']
Doba běhu: 10955.0 ms
The third example goes even further and downloads the results in the form of an SDF file.
Expand/Shrink
Zdroj: (pubchem1-3.py)
  1   query_temp = '''<?xml version="1.0"?>
  2   <!DOCTYPE PCT-Data PUBLIC "-//NCBI//NCBI PCTools/EN" "http://pubchem.ncbi.nlm.nih.gov/pug/pug.dtd">
  3   <PCT-Data>
  4     <PCT-Data_input>
  5       <PCT-InputData>
  6         <PCT-InputData_query>
  7           <PCT-Query>
  8             <PCT-Query_type>
  9               <PCT-QueryType>
 10                 <PCT-QueryType_css>
 11                   <PCT-QueryCompoundCS>
 12                     <PCT-QueryCompoundCS_query>
 13                       <PCT-QueryCompoundCS_query_data>%(query)s</PCT-QueryCompoundCS_query_data>
 14                     </PCT-QueryCompoundCS_query>
 15                     <PCT-QueryCompoundCS_type>
 16                       <PCT-QueryCompoundCS_type_identical>
 17                         <PCT-CSIdentity value="same-isotope">4</PCT-CSIdentity>
 18                       </PCT-QueryCompoundCS_type_identical>
 19                     </PCT-QueryCompoundCS_type>
 20                     <PCT-QueryCompoundCS_results>100</PCT-QueryCompoundCS_results>
 21                   </PCT-QueryCompoundCS>
 22                 </PCT-QueryType_css>
 23               </PCT-QueryType>
 24             </PCT-Query_type>
 25           </PCT-Query>
 26         </PCT-InputData_query>
 27       </PCT-InputData>
 28     </PCT-Data_input>
 29   </PCT-Data>
 30   '''  # PUG structure-query request template; %(query)s is filled with the structure string (presumably SMILES) before sending
 31   
 32   check_temp = '''<PCT-Data>
 33     <PCT-Data_input>
 34       <PCT-InputData>
 35         <PCT-InputData_request>
 36           <PCT-Request>
 37             <PCT-Request_reqid>%(reqid)s</PCT-Request_reqid>
 38             <PCT-Request_type value="status"/>
 39           </PCT-Request>
 40         </PCT-InputData_request>
 41       </PCT-InputData>
 42     </PCT-Data_input>
 43   </PCT-Data>'''  # status request template; %(reqid)s is the id of a previously submitted request
 44   
 45   download_temp = '''<PCT-Data>
 46     <PCT-Data_input>
 47       <PCT-InputData>
 48         <PCT-InputData_download>
 49           <PCT-Download>
 50             <PCT-Download_uids>
 51               <PCT-QueryUids>
 52                 <PCT-QueryUids_entrez>
 53                 <PCT-Entrez>
 54                 <PCT-Entrez_db>pccompound</PCT-Entrez_db>
 55                 <PCT-Entrez_query-key>%(query_key)s</PCT-Entrez_query-key>
 56                 <PCT-Entrez_webenv>%(webenv)s</PCT-Entrez_webenv>
 57                 </PCT-Entrez>
 58                 </PCT-QueryUids_entrez>
 59               </PCT-QueryUids>
 60             </PCT-Download_uids>
 61             <PCT-Download_format value="sdf"/>
 62             <PCT-Download_compression value="gzip"/>
 63           </PCT-Download>
 64         </PCT-InputData_download>
 65       </PCT-InputData>
 66     </PCT-Data_input>
 67   </PCT-Data>'''  # download request template: asks for the hits identified by %(query_key)s / %(webenv)s as gzip-compressed SDF
 68   
 69   cid_url = '''http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=xml&rettype=uilist&WebEnvRq=1&db=pccompound&query_key=%(query_key)s&WebEnv=%(webenv)s'''  # efetch URL template; NOTE(review): not referenced anywhere else in this listing
 70   
 71   import urllib2
 72   site = "http://pubchem.ncbi.nlm.nih.gov/pug/pug.cgi"
 73   
 74   import time
 75   import re
 76   
 77   def pug_query( query):
 78       f = urllib2.urlopen( site, query)
 79       out = f.read()
 80       f.close()
 81       while '<PCT-Status value="queued"/>' in out or '<PCT-Status value="running"/>' in out:
 82           time.sleep( 0.5)
 83           m = re.search( "<PCT-Waiting_reqid>(\d+)</PCT-Waiting_reqid>", out)
 84           if m:
 85               reqid = m.group( 1)
 86               out = check_query( reqid)
 87       return out
 88   
 89   def check_query( reqid):  # send a status request for `reqid` to PUG and return the raw reply text
 90       check = check_temp % locals()  # locals() supplies the `reqid` key that the template expects
 91       f = urllib2.urlopen( site, check)
 92       out = f.read()
 93       f.close()
 94       return out
 95   
 96   def get_query_key_and_webenv_from_result( text):
 97       m = re.search( "<PCT-Entrez_query-key>(\d+)</PCT-Entrez_query-key>", text)
 98       assert m
 99       query_key = m.group( 1)
100       m = re.search( "<PCT-Entrez_webenv>(.+?)</PCT-Entrez_webenv>", text)
101       assert m 
102       webenv = m.group( 1)
103       return query_key, webenv
104   
105   
106   # submit the structure query and wait until PUG has finished processing it
107   query_text = query_temp % {'query':"c1ccccc1C"}
108   result = pug_query( query_text)
109   # extract query_key and webenv from the result
110   query_key, webenv = get_query_key_and_webenv_from_result( result)
111   # ask for the result as an SDF file
112   download_query = download_temp % globals()  # globals() supplies the query_key and webenv assigned just above
113   result = pug_query( download_query)
114   url = re.search( "<PCT-Download-URL_url>(.*)</PCT-Download-URL_url>", result).group(1)  # NOTE(review): raises AttributeError if the reply has no download URL; (.*) is greedy
115   print url
116   # download the SDF into a temp file
117   sdf = urllib2.urlopen( url)
118   tempfile = file( "temp.sdf.gz", "wb")  # binary mode: the payload was requested gzip-compressed
119   tempfile.write( sdf.read())  # the whole download is read into memory before being written
120   tempfile.close()
121   sdf.close()
122   # process the file using pybel
123   import pybel
124   for mol in pybel.readfile( "sdf", "temp.sdf.gz"):  # the .gz file is passed to pybel as-is
125       print mol
stdout:
ftp://ftp-private.ncbi.nlm.nih.gov/pubchem/.fetch/77537037372397294.sdf.gz
c1(ccccc1)C	1140

Doba běhu: 10737.8 ms