Coverage for cclib/io/ccio.py: 72%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1# -*- coding: utf-8 -*-

5# This file is part of cclib (http://cclib.github.io) and is distributed under

6# the terms of the BSD 3-Clause License.

7"""Tools for identifying, reading and writing files and streams."""

9import atexit

10import io

11import os

12import sys

13import re

14from tempfile import NamedTemporaryFile

15from urllib.request import urlopen

16from urllib.error import URLError

18from cclib.parser import data

19from cclib.parser import logfileparser

20from cclib.parser.utils import find_package

22from cclib.parser.adfparser import ADF

23from cclib.parser.daltonparser import DALTON

24from cclib.parser.fchkparser import FChk

25from cclib.parser.gamessparser import GAMESS

26from cclib.parser.gamessukparser import GAMESSUK

27from cclib.parser.gaussianparser import Gaussian

28from cclib.parser.jaguarparser import Jaguar

29from cclib.parser.molcasparser import Molcas

30from cclib.parser.molproparser import Molpro

31from cclib.parser.mopacparser import MOPAC

32from cclib.parser.nwchemparser import NWChem

33from cclib.parser.orcaparser import ORCA

34from cclib.parser.psi3parser import Psi3

35from cclib.parser.psi4parser import Psi4

36from cclib.parser.qchemparser import QChem

37from cclib.parser.turbomoleparser import Turbomole

39from cclib.io import cjsonreader

40from cclib.io import cjsonwriter

41from cclib.io import cmlwriter

42from cclib.io import moldenwriter

43from cclib.io import wfxwriter

44from cclib.io import xyzreader

45from cclib.io import xyzwriter

47_has_cclib2openbabel = find_package("openbabel")

48if _has_cclib2openbabel:

49 from cclib.bridge import cclib2openbabel

51_has_pandas = find_package("pandas")

52if _has_pandas:

53 import pandas as pd

# Regular expression for validating URLs. The pattern is anchored at both
# ends and compiled case-insensitively, so the whole string must look like
# a URL for a match to succeed.
URL_PATTERN = re.compile(

    r'^(?:http|ftp)s?://'  # scheme: http://, https://, ftp:// or ftps://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE

)

# Parser choice is triggered by certain phrases occurring in the logfile. Where
# these strings are unique, we can set the parser and break. In other cases, the
# situation is a little bit more complicated. Here are the exceptions:
#   1. The GAMESS trigger also works for GAMESS-UK files, so we can't break
#      after finding GAMESS in case the more specific phrase is found.
#   2. Molpro log files don't have the program header, but always contain
#      the generic string 1PROGRAM, so don't break here either to be cautious.
#   3. "MOPAC" is used in some packages like GAMESS, so match MOPAC20##
#
# The triggers are defined by the tuples in the list below like so:
#   (parser, phrases, flag whether we should break)
# All phrases of an entry must occur on the same line for it to match, and
# matching is case-insensitive (see guess_filetype).
triggers = [

    (ADF, ["Amsterdam Density Functional"], True),
    (DALTON, ["Dalton - An Electronic Structure Program"], True),
    (FChk, ["Number of atoms", "I"], True),
    (GAMESS, ["GAMESS"], False),
    (GAMESS, ["GAMESS VERSION"], True),
    (GAMESSUK, ["G A M E S S - U K"], True),
    (Gaussian, ["Gaussian, Inc."], True),
    (Jaguar, ["Jaguar"], True),
    (Molcas, ["MOLCAS"], True),
    (Molpro, ["PROGRAM SYSTEM MOLPRO"], True),
    (Molpro, ["1PROGRAM"], False),
    (MOPAC, ["MOPAC20"], True),
    (NWChem, ["Northwest Computational Chemistry Package"], True),
    (ORCA, ["O R C A"], True),
    (Psi3, ["PSI3: An Open-Source Ab Initio Electronic Structure Package"], True),
    (Psi4, ["Psi4: An Open-Source Ab Initio Electronic Structure Package"], True),
    (QChem, ["A Quantum Leap Into The Future Of Chemistry"], True),
    (Turbomole, ["TURBOMOLE"], True),

]

100

# Reader classes for inputs that are not program log files, keyed by the
# lower-case file extension without its leading dot (this is the form that
# ccopen compares against).
readerclasses = {
    'cjson': cjsonreader.CJSON,
    'json': cjsonreader.CJSON,
    'xyz': xyzreader.XYZ,
}

106

# Writer classes keyed by lower-case output format name (no leading dot);
# _determine_output_format looks the requested format up in this mapping.
writerclasses = {
    'cjson': cjsonwriter.CJSON,
    'json': cjsonwriter.CJSON,
    'cml': cmlwriter.CML,
    'molden': moldenwriter.MOLDEN,
    'wfx': wfxwriter.WFXWriter,
    'xyz': xyzwriter.XYZ,
}

115

116

class UnknownOutputFormatError(Exception):
    """Signals that the requested output format is not a known one."""

119

120

def guess_filetype(inputfile, trigger_table=None):
    """Try to guess the filetype by searching for trigger strings.

    Inputs:
        inputfile - an iterable yielding text lines (an open file, a
            fileinput.FileInput, a list of strings, ...). Note that a plain
            string is iterated character by character, so pass lines rather
            than a single block of text. Falsy input gives None.
        trigger_table - optional list of (parser, phrases, do_break) tuples
            to match against; defaults to the module-level ``triggers``.

    Returns:
        the parser of the matching trigger, or None if nothing matched. A
        trigger matches a line when all of its phrases occur in that line
        (case-insensitively). A match with do_break set is returned
        immediately; otherwise scanning continues and the most recent
        match is returned at the end.
    """
    if not inputfile:
        return None

    if trigger_table is None:
        trigger_table = triggers

    # Lower-case the phrases once up front instead of once per line.
    lowered_triggers = [
        (parser, [phrase.lower() for phrase in phrases], do_break)
        for parser, phrases, do_break in trigger_table
    ]

    filetype = None
    # Scan the input in a single pass. Iterating the same object in two
    # nested loops makes the outer loop consume the first line of file-like
    # inputs, so that line would never be tested against the triggers.
    for line in inputfile:
        text = line.lower()
        for parser, phrases, do_break in lowered_triggers:
            if all(phrase in text for phrase in phrases):
                filetype = parser
                if do_break:
                    return filetype
    return filetype

143

144

def ccread(source, *args, **kwargs):
    """Open a computational chemistry file and return its parsed data.

    The source is first handed to ccopen. When ccopen cannot produce a
    reader for it, the fallback mechanism (which may use a bridge such as
    Open Babel) is tried instead.

    Inputs:
        source - a single logfile, a list of logfiles (for a single job),
                 an input stream, or an URL pointing to a log file.
        *args, **kwargs - arguments and keyword arguments passed to ccopen
    Returns:
        a ccData object containing cclib data attributes
    """
    logfile = ccopen(source, *args, **kwargs)
    verbose = kwargs.get('verbose', None)

    # Nothing recognised by ccopen: hand the source over to the fallback.
    if not logfile:
        if verbose:
            print('Attempting to use fallback mechanism to read file')
        return fallback(source)

    if verbose:
        print('Identified logfile to be in %s format' % logfile.logname)

    # A CJSON input is read through its dedicated method rather than parsed
    # like a standard compchem log file.
    if kwargs.get("cjson", False):
        return logfile.read_cjson()
    return logfile.parse()

174

175

def ccopen(source, *args, **kwargs):
    """Guess the identity of a particular log file and return an instance of it.

    Inputs:
        source - a single logfile, a list of logfiles (for a single job),
                 an input stream, or an URL pointing to a log file.
        *args, **kwargs - arguments and keyword arguments passed to filetype.
                 The ``cjson`` keyword additionally forces the CJSON reader
                 when the file type cannot be guessed from its contents.

    Returns:
      one of ADF, DALTON, GAMESS, GAMESS UK, Gaussian, Jaguar,
      Molpro, MOPAC, NWChem, ORCA, Psi3, Psi/Psi4, QChem, CJSON or None
      (if it cannot figure it out or the file does not exist).
    """
    inputfile = None
    is_stream = False

    # Check if source is a link or contains links. Retrieve their content.
    # Try to open the logfile(s), using openlogfile, if the source is a string (filename)
    # or list of filenames. If it can be read, assume it is an open file object/stream.
    is_string = isinstance(source, str)
    is_url = bool(is_string and URL_PATTERN.match(source))
    is_listofstrings = isinstance(source, list) and all(isinstance(s, str) for s in source)
    if is_string or is_listofstrings:
        # Process links from list (download contents into temporary location)
        if is_listofstrings:
            filelist = []
            for filename in source:
                if not URL_PATTERN.match(filename):
                    filelist.append(filename)
                else:
                    try:
                        response = urlopen(filename)
                        tfile = NamedTemporaryFile(delete=False)
                        tfile.write(response.read())
                        # Close the file because Windows won't let open it second time
                        tfile.close()
                        filelist.append(tfile.name)
                        # Delete temporary file when the program finishes
                        atexit.register(os.remove, tfile.name)
                    except (ValueError, URLError):
                        # Do not unpack error.args here: URLError carries a
                        # single argument, so a two-name unpack would raise
                        # ValueError instead of letting us return None.
                        return None
            source = filelist

        if not is_url:
            try:
                inputfile = logfileparser.openlogfile(source)
            except IOError:
                return None
        else:
            try:
                response = urlopen(source)
                is_stream = True

                # Retrieve filename from URL if possible
                filename = re.findall(r"\w+\.\w+", source.split('/')[-1])
                filename = filename[0] if filename else ""

                inputfile = logfileparser.openlogfile(filename, object=response.read())
            except (ValueError, URLError):
                return None

    elif hasattr(source, "read"):
        inputfile = source
        is_stream = True

    # The source is neither a filename, a list of filenames, nor a readable
    # stream, so there is nothing to inspect.
    if inputfile is None:
        return None

    # Streams are tricky since they don't have seek methods or seek won't work
    # by design even if it is present. We solve this now by reading in the
    # entire stream and using a StringIO buffer for parsing. This might be
    # problematic for very large streams. Slow streams might also be an issue if
    # the parsing is not instantaneous, but we'll deal with such edge cases
    # as they arise. Ideally, in the future we'll create a class dedicated to
    # dealing with these issues, supporting both files and streams.
    if is_stream:
        try:
            inputfile.seek(0, 0)
        except (AttributeError, IOError):
            contents = inputfile.read()
            # io.StringIO only accepts text, so decode binary streams first.
            if isinstance(contents, bytes):
                contents = contents.decode("utf-8", errors="replace")
            inputfile = io.StringIO(contents)
            inputfile.seek(0, 0)

    filetype = guess_filetype(inputfile)

    # If the input file isn't a standard compchem log file, try one of
    # the readers, falling back to Open Babel.
    if not filetype:
        if kwargs.get("cjson"):
            filetype = readerclasses['cjson']
        elif is_string and source and not is_stream:
            # Only a single filename has an extension to look at; a list of
            # filenames cannot be passed to os.path.splitext.
            ext = os.path.splitext(source)[1][1:].lower()
            filetype = readerclasses.get(ext)

    # Proceed to return an instance of the logfile parser only if the filetype
    # could be guessed. Need to make sure the input file is closed before creating
    # an instance, because parsers will handle opening/closing on their own.
    if filetype:
        # We're going to close and reopen below anyway, so this is just to avoid
        # the missing seek method for fileinput.FileInput. In the long run
        # we need to refactor to support for various input types in a more
        # centralized fashion.
        if not is_listofstrings:
            inputfile.seek(0, 0)
        if not is_stream:
            if is_listofstrings and filetype == Turbomole:
                source = sort_turbomole_outputs(source)
            inputfile.close()
            return filetype(source, *args, **kwargs)
        return filetype(inputfile, *args, **kwargs)

    # Nothing recognised. Release the handle we opened ourselves; streams
    # supplied by the caller are left open for the caller to manage.
    if not is_stream:
        inputfile.close()
    return None

300

301

def fallback(source):
    """Attempt to read standard molecular formats using other libraries.

    Currently this will read XYZ files with OpenBabel, but this can easily
    be extended to other formats and libraries, too.

    Inputs:
        source - the filename to read; anything that is not a string is
                 ignored.
    Returns:
        whatever cclib2openbabel.readfile returns when Open Babel is
        available and lists the file extension among its input formats,
        otherwise None.
    """

    if isinstance(source, str):
        ext = os.path.splitext(source)[1][1:].lower()
        if _has_cclib2openbabel:
            # From OB 3.0 onward, Pybel is contained inside the OB module.
            # Catch only import failures so that unrelated errors (or a
            # KeyboardInterrupt) are not silently swallowed.
            try:
                import openbabel.pybel as pb
            except ImportError:
                import pybel as pb
            if ext in pb.informats:
                return cclib2openbabel.readfile(source, ext)
        else:
            print("Could not import `openbabel`, fallback mechanism might not work.")
    return None

321

322

def ccwrite(ccobj, outputtype=None, outputdest=None,
            indices=None, terse=False, returnstr=False,
            *args, **kwargs):
    """Convert parsed data into a standard chemical representation and
    optionally write it out.

    Inputs:
        ccobj - Either a job (from ccopen) or a data (from job.parse()) object
        outputtype - The output format (should be a string)
        outputdest - A filename or file object for writing
        indices - One or more indices for extracting specific geometries/etc. (zero-based)
        terse - This option is currently limited to the cjson/json format. Whether to indent the cjson/json or not
        returnstr - Whether or not to return a string representation.

    Any further positional and keyword arguments are handed to the writer
    class; see the individual writers for what they accept.

    Returns:
        the string representation of the chemical datatype requested when
        no destination is given or returnstr is set, otherwise None.

    Raises:
        ValueError if ccobj is neither a job nor a data object, or if
        outputdest is neither None, a string nor an io.IOBase instance.
    """

    # Pick the writer class first, from outputtype or else outputdest.
    writer_class = _determine_output_format(outputtype, outputdest)

    # An unparsed job is parsed here; already parsed data is used as is.
    if isinstance(ccobj, logfileparser.Logfile):
        jobfilename, ccdata = ccobj.filename, ccobj.parse()
    elif isinstance(ccobj, data.ccData):
        jobfilename, ccdata = None, ccobj
    else:
        raise ValueError

    # A logfile name given through kwargs (as the ccwrite script does) takes
    # precedence, and is removed so it is not passed to the writer twice.
    jobfilename = kwargs.pop('jobfilename', jobfilename)

    writer = writer_class(ccdata, jobfilename=jobfilename,
                          indices=indices, terse=terse,
                          *args, **kwargs)
    output = writer.generate_repr()

    # Without a destination the representation is simply handed back.
    if outputdest is None:
        return output

    if isinstance(outputdest, str):
        with open(outputdest, 'w') as handle:
            handle.write(output)
    elif isinstance(outputdest, io.IOBase):
        outputdest.write(output)
    else:
        raise ValueError

    return output if returnstr else None

385

386

def _determine_output_format(outputtype, outputdest):
    """
    Determine the correct output format.

    Inputs:
      outputtype - a string corresponding to the file type
      outputdest - a filename string or file handle
    Returns:
      outputclass - the class corresponding to the correct output format
    Raises:
      UnknownOutputFormatError for unsupported file writer extensions, or
      when no format can be derived from either argument
    """

    # Priority for determining the correct output format:
    #  1. outputtype
    #  2. outputdest

    if isinstance(outputtype, str):
        extension = outputtype.lower()
    elif isinstance(outputdest, str):
        # os.path.splitext keeps the leading period on the extension, while
        # the writerclasses keys have none, so drop it before the lookup.
        extension = os.path.splitext(outputdest)[1][1:].lower()
    elif isinstance(outputdest, io.IOBase):
        # In-memory streams have no name, and files opened from a descriptor
        # have an integer name; neither can tell us the format.
        name = getattr(outputdest, "name", None)
        if not isinstance(name, str):
            raise UnknownOutputFormatError
        extension = os.path.splitext(name)[1][1:].lower()
    else:
        raise UnknownOutputFormatError

    if extension not in writerclasses:
        raise UnknownOutputFormatError(extension)
    return writerclasses[extension]

426

def path_leaf(path):
    """
    Return the last component of a path, ignoring a trailing separator.

    The path is split with os.path, so the separators recognised are those
    of the platform the code runs on.

    Inputs:
      path - a string path of a logfile.
    Returns:
      'directory/subdirectory/logfilename' gives 'logfilename', and so does
      'directory/subdirectory/logfilename/'.
    """
    directory, leaf = os.path.split(path)
    if leaf:
        return leaf
    # The path ended with a separator: the name is the last part of the rest.
    return os.path.basename(directory)

440

def sort_turbomole_outputs(filelist):
    """
    Put a list of Turbomole files into the order needed for parsing.

    Files whose name (the last path component) appears in the table below
    come first, ordered by their rank there; all other files follow, in the
    order in which they were given.

    Inputs:
      filelist - a list of Turbomole log files needed to be parsed.
    Returns:
      sorted_list - a sorted list of Turbomole files needed for proper parsing.
    """
    sorting_order = {
        'basis' : 0,
        'control' : 1,
        'mos' : 2,
        'alpha' : 3,
        'beta' : 4,
        'job.last' : 5,
        'coord' : 6,
        'gradient' : 7,
        'aoforce' : 8,
    }

    # Unknown names share one rank that is larger than every known rank.
    # sorted() is stable, so they end up last and keep their input order.
    unknown_rank = len(sorting_order)

    def rank(fname):
        return sorting_order.get(path_leaf(fname), unknown_rank)

    return sorted(filelist, key=rank)

477

478

479def _check_pandas(found_pandas):

480 if not found_pandas:

481 raise ImportError("You must install `pandas` to use this function")

482

483

def ccframe(ccobjs, *args, **kwargs):
    """Collect the data attributes of one or more logfiles in a
    pandas.DataFrame, one row per logfile.

    Inputs:
        ccobjs - an iterable of either cclib jobs (from ccopen) or data (from
        job.parse()) objects

    Returns:
        a pandas.DataFrame

    Raises:
        ImportError if pandas is not available, ValueError if an item is
        neither a job nor a data object.
    """
    _check_pandas(_has_pandas)

    rows = []
    for item in ccobjs:
        # Jobs still need parsing; data objects are used directly and have
        # no file name attached.
        if isinstance(item, logfileparser.Logfile):
            name = item.filename
            parsed = item.parse()
        elif isinstance(item, data.ccData):
            name = None
            parsed = item
        else:
            raise ValueError

        record = parsed.getattributes()
        record.update({'jobfilename': name})
        rows.append(pd.Series(record))

    return pd.DataFrame(rows)

515

516

# find_package is only used for the optional-dependency checks near the top
# of this module; delete the name so it is not left in the module namespace.
del find_package

Coverage for cclib/io/ccio.py : 72%

239 statements

Coverage for cclib/io/ccio.py : 72%

239 statements 171 run 68 missing 0 excluded

239 statements