Coverage for cclib/io/ccio.py : 72%
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# -*- coding: utf-8 -*-
#
# Copyright (c) 2020, the cclib development team
#
# This file is part of cclib (http://cclib.github.io) and is distributed under
# the terms of the BSD 3-Clause License.

"""Tools for identifying, reading and writing files and streams."""
9import atexit
10import io
11import os
12import sys
13import re
14from tempfile import NamedTemporaryFile
15from urllib.request import urlopen
16from urllib.error import URLError
18from cclib.parser import data
19from cclib.parser import logfileparser
20from cclib.parser.utils import find_package
22from cclib.parser.adfparser import ADF
23from cclib.parser.daltonparser import DALTON
24from cclib.parser.fchkparser import FChk
25from cclib.parser.gamessparser import GAMESS
26from cclib.parser.gamessukparser import GAMESSUK
27from cclib.parser.gaussianparser import Gaussian
28from cclib.parser.jaguarparser import Jaguar
29from cclib.parser.molcasparser import Molcas
30from cclib.parser.molproparser import Molpro
31from cclib.parser.mopacparser import MOPAC
32from cclib.parser.nwchemparser import NWChem
33from cclib.parser.orcaparser import ORCA
34from cclib.parser.psi3parser import Psi3
35from cclib.parser.psi4parser import Psi4
36from cclib.parser.qchemparser import QChem
37from cclib.parser.turbomoleparser import Turbomole
39from cclib.io import cjsonreader
40from cclib.io import cjsonwriter
41from cclib.io import cmlwriter
42from cclib.io import moldenwriter
43from cclib.io import wfxwriter
44from cclib.io import xyzreader
45from cclib.io import xyzwriter
# Optional dependencies: each flag records the result of find_package, and the
# matching import only happens when that result is truthy.
_has_cclib2openbabel = find_package("openbabel")
if _has_cclib2openbabel:
    from cclib.bridge import cclib2openbabel

_has_pandas = find_package("pandas")
if _has_pandas:
    import pandas as pd
# Case-insensitive regular expression used to decide whether a string is a URL.
URL_PATTERN = re.compile(
    r'^(?:http|ftp)s?://'  # scheme: http, https, ftp or ftps
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain name...
    r'localhost|'  # ...or localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or dotted IPv4 address
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$',  # optional path or query
    flags=re.IGNORECASE,
)
# The parser is chosen by looking for certain phrases in the logfile. When a
# phrase is unique to one program we can pick that parser and stop searching.
# Some cases need more care:
#   1. The GAMESS trigger also matches GAMESS-UK files, so we must not stop
#      after finding GAMESS in case the more specific phrase shows up later.
#   2. Molpro log files don't have the program header, but always contain
#      the generic string 1PROGRAM, so to be cautious we don't stop there.
#   3. "MOPAC" is used in some packages like GAMESS, so match MOPAC20##
#
# Each entry below is a tuple of the form:
#   (parser, phrases, flag whether we should break)
triggers = [
    (ADF,       ["Amsterdam Density Functional"],                                False),
    (DALTON,    ["Dalton - An Electronic Structure Program"],                    True),
    (FChk,      ["Number of atoms", "I"],                                        True),
    (GAMESS,    ["GAMESS"],                                                      False),
    (GAMESS,    ["GAMESS VERSION"],                                              True),
    (GAMESSUK,  ["G A M E S S - U K"],                                           True),
    (Gaussian,  ["Gaussian, Inc."],                                              True),
    (Jaguar,    ["Jaguar"],                                                      True),
    (Molcas,    ["MOLCAS"],                                                      True),
    (Molpro,    ["PROGRAM SYSTEM MOLPRO"],                                       True),
    (Molpro,    ["1PROGRAM"],                                                    False),
    (MOPAC,     ["MOPAC20"],                                                     True),
    (NWChem,    ["Northwest Computational Chemistry Package"],                   True),
    (ORCA,      ["O R C A"],                                                     True),
    (Psi3,      ["PSI3: An Open-Source Ab Initio Electronic Structure Package"], True),
    (Psi4,      ["Psi4: An Open-Source Ab Initio Electronic Structure Package"], True),
    (QChem,     ["A Quantum Leap Into The Future Of Chemistry"],                 True),
    (Turbomole, ["TURBOMOLE"],                                                   True),
]
triggers[0] = (ADF, ["Amsterdam Density Functional"], True)
# Reader classes, keyed by lower-case file extension (without the dot).
readerclasses = dict(
    cjson=cjsonreader.CJSON,
    json=cjsonreader.CJSON,
    xyz=xyzreader.XYZ,
)

# Writer classes, keyed by lower-case output format name / file extension
# (without the dot).
writerclasses = dict(
    cjson=cjsonwriter.CJSON,
    json=cjsonwriter.CJSON,
    cml=cmlwriter.CML,
    molden=moldenwriter.MOLDEN,
    wfx=wfxwriter.WFXWriter,
    xyz=xyzwriter.XYZ,
)
class UnknownOutputFormatError(Exception):
    """Signals that the requested output format is not a known one."""
def guess_filetype(inputfile):
    """Try to guess the filetype by searching for trigger strings.

    Every line is compared, case-insensitively, against the phrases in the
    module-level ``triggers`` list. A trigger matches a line when all of its
    phrases occur in that line.

    Inputs:
        inputfile - an iterable yielding text lines (for example an open
            file object or a list of lines), or a string holding the text
            itself, which is split into lines.
    Returns:
        the parser class of the first matching trigger whose break flag is
        set; otherwise the parser class of the last matching trigger; or
        None if nothing matched or inputfile is empty/None.
    """
    if not inputfile:
        return None

    # Iterating a string directly would yield single characters, which can
    # never contain a trigger phrase, so split text into lines first.
    lines = inputfile.splitlines() if isinstance(inputfile, str) else inputfile

    # Lower-case the phrases once rather than once per line.
    lowered_triggers = [
        (parser, [phrase.lower() for phrase in phrases], do_break)
        for parser, phrases, do_break in triggers
    ]

    filetype = None
    # A single pass over the lines: file objects are one-shot iterators, so
    # every line (including the first) has to be inspected in this loop.
    for line in lines:
        lowered_line = line.lower()
        for parser, phrases, do_break in lowered_triggers:
            if all(phrase in lowered_line for phrase in phrases):
                filetype = parser
                if do_break:
                    return filetype
    return filetype
def ccread(source, *args, **kwargs):
    """Open a computational chemistry file and return the data read from it.

    The source is first handed to ccopen. When ccopen cannot produce a
    logfile object, the fallback function is tried instead, which attempts
    to read some common chemistry formats through a bridge such as
    Open Babel.

    Inputs:
        source - a single logfile, a list of logfiles (for a single job),
            an input stream, or an URL pointing to a log file.
        *args, **kwargs - arguments and keyword arguments passed to ccopen
    Returns:
        a ccData object containing cclib data attributes
    """
    logfile = ccopen(source, *args, **kwargs)
    verbose = kwargs.get('verbose', None)

    # Nothing recognised by ccopen: defer to the fallback mechanism.
    if not logfile:
        if verbose:
            print('Attempting to use fallback mechanism to read file')
        return fallback(source)

    if verbose:
        print('Identified logfile to be in %s format' % logfile.logname)

    # A CJSON input is read with read_cjson rather than parsed like a
    # standard compchem logfile.
    if kwargs.get("cjson", False):
        return logfile.read_cjson()
    return logfile.parse()
def ccopen(source, *args, **kwargs):
    """Guess the identity of a particular log file and return an instance of it.

    Inputs:
        source - a single logfile, a list of logfiles (for a single job),
            an input stream, or an URL pointing to a log file.
        *args, **kwargs - arguments and keyword arguments passed to filetype.
            The ``cjson`` keyword, when true, selects the CJSON reader if no
            parser could be guessed from the contents.

    Returns:
        an instance of the parser class chosen by guess_filetype, or of a
        reader class from readerclasses (chosen through the ``cjson``
        keyword or the file extension), or None if the source cannot be
        opened or its type cannot be determined.
    """
    inputfile = None
    is_stream = False

    # Check if source is a link or contains links. Retrieve their content.
    # Try to open the logfile(s), using openlogfile, if the source is a string
    # (filename) or list of filenames. If it can be read, assume it is an open
    # file object/stream.
    is_string = isinstance(source, str)
    is_url = bool(is_string and URL_PATTERN.match(source))
    is_listofstrings = isinstance(source, list) and all(isinstance(s, str) for s in source)

    if is_string or is_listofstrings:
        # Process links from list (download contents into temporary location).
        if is_listofstrings:
            filelist = []
            for filename in source:
                if not URL_PATTERN.match(filename):
                    filelist.append(filename)
                    continue
                try:
                    response = urlopen(filename)
                    try:
                        payload = response.read()
                    finally:
                        response.close()
                    tfile = NamedTemporaryFile(delete=False)
                    try:
                        tfile.write(payload)
                    finally:
                        # Close the file because Windows won't let us open
                        # it a second time.
                        tfile.close()
                    filelist.append(tfile.name)
                    # Delete temporary file when the program finishes.
                    atexit.register(os.remove, tfile.name)
                except (ValueError, URLError):
                    # Do not unpack error.args here: these exceptions do not
                    # always carry exactly two arguments, and unpacking would
                    # raise inside the handler instead of returning None.
                    return None
            source = filelist

        if not is_url:
            try:
                inputfile = logfileparser.openlogfile(source)
            except IOError:
                return None
        else:
            try:
                response = urlopen(source)
                try:
                    payload = response.read()
                finally:
                    response.close()
                is_stream = True

                # Retrieve filename from URL if possible.
                candidates = re.findall(r"\w+\.\w+", source.split('/')[-1])
                filename = candidates[0] if candidates else ""

                inputfile = logfileparser.openlogfile(filename, object=payload)
            except (ValueError, URLError):
                return None

    elif hasattr(source, "read"):
        inputfile = source
        is_stream = True

    # Nothing could be opened (unsupported source type), so there is nothing
    # to identify.
    if inputfile is None:
        return None

    # Streams are tricky since they don't have seek methods or seek won't work
    # by design even if it is present. We solve this now by reading in the
    # entire stream and using a StringIO buffer for parsing. This might be
    # problematic for very large streams. Slow streams might also be an issue if
    # the parsing is not instantaneous, but we'll deal with such edge cases
    # as they arise.
    if is_stream:
        try:
            inputfile.seek(0, 0)
        except (AttributeError, IOError):
            contents = inputfile.read()
            # StringIO only accepts text; decode binary streams explicitly.
            if isinstance(contents, bytes):
                contents = contents.decode("utf-8", errors="replace")
            inputfile = io.StringIO(contents)
            inputfile.seek(0, 0)

    filetype = guess_filetype(inputfile)

    # If the input file isn't a standard compchem log file, try one of
    # the readers (the caller may fall back to Open Babel afterwards).
    if not filetype:
        if kwargs.get("cjson"):
            filetype = readerclasses['cjson']
        elif is_string and not is_stream:
            # Only a single filename has an extension to look at; a list of
            # filenames cannot be passed to os.path.splitext.
            ext = os.path.splitext(source)[1][1:].lower()
            filetype = readerclasses.get(ext)

    if not filetype:
        # Release the handle we opened ourselves; streams supplied by the
        # caller are left untouched.
        if not is_stream:
            inputfile.close()
        return None

    # Streams are handed to the parser directly, rewound to the start.
    if is_stream:
        inputfile.seek(0, 0)
        return filetype(inputfile, *args, **kwargs)

    # For files, the parser handles opening/closing on its own, so close our
    # handle and pass the filename(s) instead.
    if is_listofstrings and filetype == Turbomole:
        source = sort_turbomole_outputs(source)
    inputfile.close()
    return filetype(source, *args, **kwargs)
def fallback(source):
    """Attempt to read standard molecular formats using other libraries.

    Currently this will read files in formats known to Open Babel (such as
    XYZ), but this can easily be extended to other formats and libraries, too.

    Inputs:
        source - a filename; anything that is not a string is ignored.
    Returns:
        the result of cclib2openbabel.readfile when the file extension is one
        of Open Babel's input formats, otherwise None.
    """
    if not isinstance(source, str):
        return None

    if not _has_cclib2openbabel:
        print("Could not import `openbabel`, fallback mechanism might not work.")
        return None

    ext = os.path.splitext(source)[1][1:].lower()

    # From OB 3.0 onward, Pybel is contained inside the OB module.
    try:
        import openbabel.pybel as pb
    except ImportError:
        import pybel as pb

    if ext in pb.informats:
        return cclib2openbabel.readfile(source, ext)
    return None
def ccwrite(ccobj, outputtype=None, outputdest=None,
            indices=None, terse=False, returnstr=False,
            *args, **kwargs):
    """Write the parsed data from an outputfile to a standard chemical
    representation.

    Inputs:
        ccobj - Either a job (from ccopen) or a data (from job.parse()) object
        outputtype - The output format (should be a string)
        outputdest - A filename or file object for writing
        indices - One or more indices for extracting specific geometries/etc. (zero-based)
        terse - This option is currently limited to the cjson/json format. Whether to indent the cjson/json or not
        returnstr - Whether or not to return a string representation.

    The different writers may take additional arguments, which are
    documented in their respective docstrings. A ``jobfilename`` keyword
    argument, if given, takes precedence over the filename of the job.

    Returns:
        the string representation of the chemical datatype requested when
        outputdest is None or returnstr is true, otherwise None.
    Raises:
        ValueError if ccobj is neither a job nor a data object, or if
        outputdest is neither None, a filename, nor a file object.
    """

    # Determine the correct output format.
    outputclass = _determine_output_format(outputtype, outputdest)

    # Is ccobj a job object (unparsed), or is it a ccdata object (parsed)?
    if isinstance(ccobj, logfileparser.Logfile):
        jobfilename = ccobj.filename
        ccdata = ccobj.parse()
    elif isinstance(ccobj, data.ccData):
        jobfilename = None
        ccdata = ccobj
    else:
        raise ValueError(
            "ccobj must be a Logfile or ccData instance, not %s" % type(ccobj).__name__
        )

    # If the logfile name has been passed in through kwargs (such as
    # in the ccwrite script), make sure it has precedence. Popping it also
    # avoids passing it twice into the writer call below.
    jobfilename = kwargs.pop('jobfilename', jobfilename)

    outputobj = outputclass(ccdata, jobfilename=jobfilename,
                            indices=indices, terse=terse,
                            *args, **kwargs)
    output = outputobj.generate_repr()

    # If outputdest is None, return a string representation of the output.
    if outputdest is None:
        return output

    # Otherwise write the output to the given destination.
    if isinstance(outputdest, str):
        with open(outputdest, 'w') as handle:
            handle.write(output)
    elif isinstance(outputdest, io.IOBase):
        outputdest.write(output)
    else:
        raise ValueError(
            "outputdest must be None, a filename or a file object, not %s"
            % type(outputdest).__name__
        )

    if returnstr:
        return output
    return None
def _determine_output_format(outputtype, outputdest):
    """
    Determine the correct output format.

    Inputs:
      outputtype - a string corresponding to the file type
      outputdest - a filename string or file handle
    Returns:
      outputclass - the class corresponding to the correct output format
    Raises:
      UnknownOutputFormatError for unsupported file writer extensions
    """

    # Priority for determining the correct output format:
    #  1. outputtype
    #  2. outputdest

    if isinstance(outputtype, str):
        extension = outputtype.lower()
    else:
        # Fall back to the extension of the destination's filename.
        if isinstance(outputdest, str):
            destname = outputdest
        elif isinstance(outputdest, io.IOBase):
            # Not every file object has a filename (in-memory buffers have
            # no name; handles opened from a descriptor have an int name).
            destname = getattr(outputdest, "name", None)
            if not isinstance(destname, str):
                raise UnknownOutputFormatError
        else:
            raise UnknownOutputFormatError
        # os.path.splitext keeps the leading dot (".xyz"), while the keys of
        # writerclasses have none, so strip it before the lookup.
        extension = os.path.splitext(destname)[1][1:].lower()

    if extension not in writerclasses:
        raise UnknownOutputFormatError(extension)
    return writerclasses[extension]
def path_leaf(path):
    r"""
    Splits the path to give the filename. Works irrespective of '\'
    or '/' appearing in the path and also with path ending with '/' or '\'.

    Inputs:
      path - a string path of a logfile.
    Returns:
      the last path component, for example:
        'directory/subdirectory/logfilename' will return 'logfilename'.
        'directory/subdirectory/logfilename/' will return 'logfilename'.
        'directory\subdirectory\logfilename' will return 'logfilename'.
    """
    # ntpath treats both '/' and '\' as separators on every platform, whereas
    # os.path on POSIX systems only splits on '/'.
    import ntpath

    head, tail = ntpath.split(path)
    # tail is empty when the path ends with a separator; use the last
    # component of what precedes it instead.
    return tail or ntpath.basename(head)
def sort_turbomole_outputs(filelist):
    """
    Puts a list of Turbomole files into the order the ranking below
    prescribes. Files whose names are not in the ranking keep their relative
    order and are placed after all the ranked ones.

    Inputs:
      filelist - a list of Turbomole log files needed to be parsed.
    Returns:
      sorted_list - a sorted list of Turbomole files needed for proper parsing.
    """
    ranking = {
        'basis': 0,
        'control': 1,
        'mos': 2,
        'alpha': 3,
        'beta': 4,
        'job.last': 5,
        'coord': 6,
        'gradient': 7,
        'aoforce': 8,
    }

    ranked = []
    unranked = []
    for path in filelist:
        position = ranking.get(path_leaf(path))
        if position is None:
            unranked.append(path)
        else:
            ranked.append((position, path))

    # Sort on the rank alone; the sort is stable, so files sharing a rank
    # stay in their input order.
    ranked.sort(key=lambda pair: pair[0])
    return [path for _, path in ranked] + unranked
479def _check_pandas(found_pandas):
480 if not found_pandas:
481 raise ImportError("You must install `pandas` to use this function")
def ccframe(ccobjs, *args, **kwargs):
    """Build a pandas.DataFrame with one row per job, holding the data
    attributes cclib parsed for it plus a 'jobfilename' entry.

    Inputs:
        ccobjs - an iterable of either cclib jobs (from ccopen) or data (from
        job.parse()) objects

    Returns:
        a pandas.DataFrame
    """
    _check_pandas(_has_pandas)

    rows = []
    for item in ccobjs:
        # Jobs still need parsing; data objects are already parsed and carry
        # no filename.
        if isinstance(item, logfileparser.Logfile):
            filename = item.filename
            parsed = item.parse()
        elif isinstance(item, data.ccData):
            filename = None
            parsed = item
        else:
            raise ValueError

        record = parsed.getattributes()
        record.update(jobfilename=filename)
        rows.append(pd.Series(record))

    return pd.DataFrame(rows)
# find_package is only needed while the module is being imported; drop the
# name from this module's namespace afterwards.
del find_package