Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2# 

3# Copyright (c) 2020, the cclib development team 

4# 

5# This file is part of cclib (http://cclib.github.io) and is distributed under 

6# the terms of the BSD 3-Clause License. 

7"""Tools for identifying, reading and writing files and streams.""" 

8 

9import atexit 

10import io 

11import os 

12import sys 

13import re 

14from tempfile import NamedTemporaryFile 

15from urllib.request import urlopen 

16from urllib.error import URLError 

17 

18from cclib.parser import data 

19from cclib.parser import logfileparser 

20from cclib.parser.utils import find_package 

21 

22from cclib.parser.adfparser import ADF 

23from cclib.parser.daltonparser import DALTON 

24from cclib.parser.fchkparser import FChk 

25from cclib.parser.gamessparser import GAMESS 

26from cclib.parser.gamessukparser import GAMESSUK 

27from cclib.parser.gaussianparser import Gaussian 

28from cclib.parser.jaguarparser import Jaguar 

29from cclib.parser.molcasparser import Molcas 

30from cclib.parser.molproparser import Molpro 

31from cclib.parser.mopacparser import MOPAC 

32from cclib.parser.nwchemparser import NWChem 

33from cclib.parser.orcaparser import ORCA 

34from cclib.parser.psi3parser import Psi3 

35from cclib.parser.psi4parser import Psi4 

36from cclib.parser.qchemparser import QChem 

37from cclib.parser.turbomoleparser import Turbomole 

38 

39from cclib.io import cjsonreader 

40from cclib.io import cjsonwriter 

41from cclib.io import cmlwriter 

42from cclib.io import moldenwriter 

43from cclib.io import wfxwriter 

44from cclib.io import xyzreader 

45from cclib.io import xyzwriter 

46 

# Optional dependency: the Open Babel bridge is only imported when
# find_package reports the "openbabel" package (presumably a truthy result
# means the package is importable -- confirm against cclib.parser.utils).
_has_cclib2openbabel = find_package("openbabel")
if _has_cclib2openbabel:
    from cclib.bridge import cclib2openbabel

# Optional dependency: pandas is only imported when find_package reports it.
# The flag is checked by ccframe before it touches `pd`.
_has_pandas = find_package("pandas")
if _has_pandas:
    import pandas as pd

54 

# Compiled, case-insensitive pattern used to tell URLs apart from file paths.
URL_PATTERN = re.compile(
    "".join((
        r'^(?:http|ftp)s?://',  # scheme: http(s):// or ftp(s)://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|',  # domain...
        r'localhost|',  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',  # ...or ip
        r'(?::\d+)?',  # optional port
        r'(?:/?|[/?]\S+)$',  # optional path or query
    )),
    flags=re.IGNORECASE,
)

66 

# Parser choice is triggered by certain phrases occurring in the logfile. Where these
# strings are unique, we can set the parser and break. In other cases, the situation
# is a little bit more complicated. Here are the exceptions:
#   1. The GAMESS trigger also works for GAMESS-UK files, so we can't break
#      after finding GAMESS in case the more specific phrase is found.
#   2. Molpro log files don't have the program header, but always contain
#      the generic string 1PROGRAM, so don't break here either to be cautious.
#   3. "MOPAC" is used in some packages like GAMESS, so match MOPAC20##
#
# The triggers are defined by the tuples in the list below like so:
#   (parser, phrases, flag whether we should break)
#
# All phrases of an entry must occur on the same line for the entry to match,
# and the comparison is case-insensitive (see guess_filetype).
triggers = [

    (ADF,       ["Amsterdam Density Functional"],                   True),
    (DALTON,    ["Dalton - An Electronic Structure Program"],       True),
    (FChk,      ["Number of atoms", "I"],                           True),
    (GAMESS,    ["GAMESS"],                                         False),
    (GAMESS,    ["GAMESS VERSION"],                                 True),
    (GAMESSUK,  ["G A M E S S - U K"],                              True),
    (Gaussian,  ["Gaussian, Inc."],                                 True),
    (Jaguar,    ["Jaguar"],                                         True),
    (Molcas,    ["MOLCAS"],                                         True),
    (Molpro,    ["PROGRAM SYSTEM MOLPRO"],                          True),
    (Molpro,    ["1PROGRAM"],                                       False),
    (MOPAC,     ["MOPAC20"],                                        True),
    (NWChem,    ["Northwest Computational Chemistry Package"],      True),
    (ORCA,      ["O R C A"],                                        True),
    (Psi3,      ["PSI3: An Open-Source Ab Initio Electronic Structure Package"],          True),
    (Psi4,      ["Psi4: An Open-Source Ab Initio Electronic Structure Package"],          True),
    (QChem,     ["A Quantum Leap Into The Future Of Chemistry"],    True),
    (Turbomole, ["TURBOMOLE"],                                      True),

]

100 

# Reader classes keyed by lowercase file extension (without the dot).
readerclasses = dict(
    cjson=cjsonreader.CJSON,
    json=cjsonreader.CJSON,
    xyz=xyzreader.XYZ,
)

106 

# Writer classes keyed by lowercase output format name (without a dot).
writerclasses = dict(
    cjson=cjsonwriter.CJSON,
    json=cjsonwriter.CJSON,
    cml=cmlwriter.CML,
    molden=moldenwriter.MOLDEN,
    wfx=wfxwriter.WFXWriter,
    xyz=xyzwriter.XYZ,
)

115 

116 

class UnknownOutputFormatError(Exception):
    """Signal that a requested output format has no matching writer.

    The offending format name, when known, is passed as the exception
    argument.
    """

119 

120 

def guess_filetype(inputfile):
    """Try to guess the filetype by searching for trigger strings.

    Inputs:
        inputfile - an iterable of text lines (an open file object, a
            fileinput stream, a list of strings, ...) or a single string
            holding the text itself, which is split into lines.

    Returns:
        the parser class from ``triggers`` whose phrases were found, or None
        if the input is empty/falsy or nothing matched.

    Every line is examined exactly once, starting with the first one. A
    trigger matches when all of its phrases occur on the same line, ignoring
    case. Triggers flagged to break return immediately; the others only
    record a candidate that a later, more specific trigger may override.
    """
    if not inputfile:
        return None

    # Iterating a str directly would yield single characters, which can
    # never contain a trigger phrase, so treat a string as the text to scan.
    lines = inputfile.splitlines() if isinstance(inputfile, str) else inputfile

    # Lowercase the phrases once up front instead of once per line.
    lowered_triggers = [
        (parser, [phrase.lower() for phrase in phrases], do_break)
        for parser, phrases, do_break in triggers
    ]

    filetype = None
    for line in lines:
        lowered_line = line.lower()
        for parser, phrases, do_break in lowered_triggers:
            if all(phrase in lowered_line for phrase in phrases):
                filetype = parser
                if do_break:
                    return filetype
    return filetype

143 

144 

def ccread(source, *args, **kwargs):
    """Open a computational chemistry file and return the data read from it.

    The source is first handed to ccopen. When that yields nothing usable,
    the fallback mechanism is tried instead, which attempts to recognize
    some common chemistry formats through a bridge such as Open Babel.

    Inputs:
        source - a single logfile, a list of logfiles (for a single job),
                 an input stream, or an URL pointing to a log file.
        *args, **kwargs - arguments and keyword arguments passed to ccopen
    Returns:
        a ccData object containing cclib data attributes
    """
    wants_messages = kwargs.get('verbose', None)
    logfile = ccopen(source, *args, **kwargs)

    # Nothing identified: hand the original source over to the fallback.
    if not logfile:
        if wants_messages:
            print('Attempting to use fallback mechanism to read file')
        return fallback(source)

    if wants_messages:
        print('Identified logfile to be in %s format' % logfile.logname)

    # A CJSON input is read directly rather than parsed like a standard
    # compchem log file.
    if kwargs.get("cjson", False):
        return logfile.read_cjson()
    return logfile.parse()

174 

175 

def _download_to_tempfile(url):
    """Download ``url`` into a named temporary file and return its path.

    The file is closed before returning, because Windows won't let a
    temporary file be opened a second time while it is still open, and its
    removal is scheduled for when the program finishes.
    """
    with urlopen(url) as response:
        payload = response.read()
    with NamedTemporaryFile(delete=False) as tfile:
        # Register the cleanup first so a failed write does not leave the
        # file behind.
        atexit.register(os.remove, tfile.name)
        tfile.write(payload)
    return tfile.name


def ccopen(source, *args, **kwargs):
    """Guess the identity of a particular log file and return an instance of it.

    Inputs:
        source - a single logfile, a list of logfiles (for a single job),
                 an input stream, or an URL pointing to a log file.
        *args, **kwargs - arguments and keyword arguments passed to filetype

    Returns:
        an instance of the parser class identified by guess_filetype or, when
        no parser matches, of a reader class selected by the ``cjson`` keyword
        or by the file extension; None if the source cannot be opened,
        downloaded or identified.

    The ``quiet`` keyword is forwarded to the parser untouched; this function
    itself prints nothing and reports failures by returning None.
    """
    inputfile = None
    is_stream = False

    is_string = isinstance(source, str)
    is_url = bool(is_string and URL_PATTERN.match(source))
    is_listofstrings = isinstance(source, list) and all(isinstance(s, str) for s in source)

    # Replace any links in a list by local temporary copies of their content.
    if is_listofstrings:
        filelist = []
        for filename in source:
            if URL_PATTERN.match(filename):
                try:
                    filename = _download_to_tempfile(filename)
                except (ValueError, URLError):
                    return None
            filelist.append(filename)
        source = filelist

    if is_url:
        try:
            with urlopen(source) as response:
                payload = response.read()
            is_stream = True

            # Retrieve filename from URL if possible
            matches = re.findall(r"\w+\.\w+", source.split('/')[-1])
            filename = matches[0] if matches else ""

            inputfile = logfileparser.openlogfile(filename, object=payload)
        except (ValueError, URLError):
            return None
    elif is_string or is_listofstrings:
        try:
            inputfile = logfileparser.openlogfile(source)
        except IOError:
            return None
    elif hasattr(source, "read"):
        # Anything that can be read is assumed to be an open file object/stream.
        inputfile = source
        is_stream = True

    # Unsupported source type (or nothing could be opened): there is nothing
    # to inspect, so report failure instead of failing on None further down.
    if inputfile is None:
        return None

    # Streams are tricky since they don't have seek methods or seek won't work
    # by design even if it is present. We solve this now by reading in the
    # entire stream and using a StringIO buffer for parsing. This might be
    # problematic for very large streams.
    if is_stream:
        try:
            inputfile.seek(0, 0)
        except (AttributeError, IOError):
            contents = inputfile.read()
            if isinstance(contents, bytes):
                # StringIO only accepts text; undecodable bytes are replaced
                # rather than aborting the whole read.
                contents = contents.decode("utf-8", errors="replace")
            inputfile = io.StringIO(contents)

    filetype = guess_filetype(inputfile)

    # If the input file isn't a standard compchem log file, try one of
    # the readers.
    if not filetype:
        if kwargs.get("cjson"):
            filetype = readerclasses['cjson']
        elif is_string and not is_stream:
            # Only a single filename has an extension to look up; a list of
            # files cannot be passed to os.path.splitext.
            ext = os.path.splitext(source)[1][1:].lower()
            filetype = readerclasses.get(ext)

    if not filetype:
        # Don't leak the file we opened ourselves; streams handed in by the
        # caller are left alone.
        if not is_stream:
            inputfile.close()
        return None

    if is_stream:
        # The stream was consumed while guessing, so rewind it for the parser.
        inputfile.seek(0, 0)
        return filetype(inputfile, *args, **kwargs)

    # Files are closed before creating the instance, because parsers handle
    # opening/closing on their own.
    if is_listofstrings and filetype == Turbomole:
        source = sort_turbomole_outputs(source)
    inputfile.close()
    return filetype(source, *args, **kwargs)

300 

301 

def fallback(source):
    """Attempt to read standard molecular formats using other libraries.

    Currently this reads any file whose extension is listed as an input
    format by Pybel, using the Open Babel bridge, but this can easily be
    extended to other formats and libraries, too.

    Inputs:
        source - a filename; anything that is not a string is ignored.
    Returns:
        whatever cclib2openbabel.readfile returns for the file, or None when
        the source is not a string, Open Babel is unavailable (a notice is
        printed in that case) or the extension is not a known input format.
    """
    if not isinstance(source, str):
        return None

    if not _has_cclib2openbabel:
        print("Could not import `openbabel`, fallback mechanism might not work.")
        return None

    # From OB 3.0 onward, Pybel is contained inside the OB module.
    try:
        import openbabel.pybel as pb
    except ImportError:
        import pybel as pb

    ext = os.path.splitext(source)[1][1:].lower()
    if ext in pb.informats:
        return cclib2openbabel.readfile(source, ext)
    return None

321 

322 

def ccwrite(ccobj, outputtype=None, outputdest=None,
            indices=None, terse=False, returnstr=False,
            *args, **kwargs):
    """Write the parsed data from an outputfile to a standard chemical
    representation.

    Inputs:
        ccobj - Either a job (from ccopen) or a data (from job.parse()) object
        outputtype - The output format (should be a string)
        outputdest - A filename or file object for writing
        indices - One or more indices for extracting specific geometries/etc. (zero-based)
        terse - This option is currently limited to the cjson/json format. Whether to indent the cjson/json or not
        returnstr - Whether or not to return a string representation.

    The different writers may take additional arguments, which are
    documented in their respective docstrings. A ``jobfilename`` keyword
    argument, if given, overrides the filename taken from ccobj.

    Returns:
        the string representation of the chemical datatype requested when
        outputdest is None or returnstr is true, otherwise None.

    Raises:
        ValueError if ccobj is neither a job nor a data object, or if
            outputdest is neither None, a filename nor an io.IOBase object.
        UnknownOutputFormatError if no writer matches the requested format.
    """

    # Determine the correct output format.
    outputclass = _determine_output_format(outputtype, outputdest)

    # Is ccobj an job object (unparsed), or is it a ccdata object (parsed)?
    if isinstance(ccobj, logfileparser.Logfile):
        jobfilename = ccobj.filename
        ccdata = ccobj.parse()
    elif isinstance(ccobj, data.ccData):
        jobfilename = None
        ccdata = ccobj
    else:
        raise ValueError(
            "ccobj must be a Logfile or ccData instance, got %s"
            % type(ccobj).__name__
        )

    # If the logfile name has been passed in through kwargs (such as
    # in the ccwrite script), make sure it has precedence. Popping it also
    # avoids passing it twice into the writer below.
    jobfilename = kwargs.pop('jobfilename', jobfilename)

    writer = outputclass(ccdata, *args, jobfilename=jobfilename,
                         indices=indices, terse=terse, **kwargs)
    output = writer.generate_repr()

    # Without a destination the string representation is the result.
    if outputdest is None:
        return output

    # Otherwise write the output to the given file name or file object.
    if isinstance(outputdest, str):
        with open(outputdest, 'w') as handle:
            handle.write(output)
    elif isinstance(outputdest, io.IOBase):
        outputdest.write(output)
    else:
        raise ValueError(
            "outputdest must be a filename or a file object, got %s"
            % type(outputdest).__name__
        )

    if returnstr:
        return output
    return None

385 

386 

def _determine_output_format(outputtype, outputdest):
    """
    Determine the correct output format.

    Inputs:
      outputtype - a string corresponding to the file type
      outputdest - a filename string or file handle
    Returns:
      outputclass - the class corresponding to the correct output format
    Raises:
      UnknownOutputFormatError for unsupported file writer extensions, or
      when no format can be inferred from outputdest
    """

    # Priority for determining the correct output format:
    #  1. outputtype
    #  2. outputdest

    if isinstance(outputtype, str):
        extension = outputtype.lower()
    else:
        # Fall back to the extension of the destination's file name.
        if isinstance(outputdest, str):
            destname = outputdest
        elif isinstance(outputdest, io.IOBase):
            # In-memory streams have no name, and files opened from a
            # descriptor carry an integer one; neither has an extension.
            destname = getattr(outputdest, 'name', None)
        else:
            destname = None
        if not isinstance(destname, str):
            raise UnknownOutputFormatError(
                "cannot infer an output format from %r" % (outputdest,)
            )
        # splitext keeps the leading dot, but the writerclasses keys have none.
        extension = os.path.splitext(destname)[1][1:].lower()

    if extension not in writerclasses:
        raise UnknownOutputFormatError(extension)
    return writerclasses[extension]

426 

def path_leaf(path):
    """
    Splits the path to give the filename. Works irrespective of a backslash
    or a forward slash appearing in the path, on any platform, and also with
    a path ending in either separator.

    Inputs:
        path - a string path of a logfile.
    Returns:
        the last component of the path:
        'directory/subdirectory/logfilename' will return 'logfilename', and
        'directory/subdirectory/logfilename/' will return 'logfilename' too.
    """
    # ntpath accepts both separators regardless of the running platform,
    # whereas os.path only splits on backslashes when running on Windows.
    import ntpath

    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)

440 

def sort_turbomole_outputs(filelist):
    """
    Puts Turbomole files into the order required for parsing.

    Files whose name (without directories) is listed below are arranged by
    the rank given there; all remaining files follow them, keeping the
    order in which they were passed in.

    Inputs:
        filelist - a list of Turbomole log files needed to be parsed.
    Returns:
        a new list with the same entries in parsing order.
    """
    rank_of = {
        'basis': 0,
        'control': 1,
        'mos': 2,
        'alpha': 3,
        'beta': 4,
        'job.last': 5,
        'coord': 6,
        'gradient': 7,
        'aoforce': 8,
    }

    ranked = []
    leftovers = []
    for entry in filelist:
        leaf = path_leaf(entry)
        if leaf in rank_of:
            ranked.append((rank_of[leaf], entry))
        else:
            leftovers.append(entry)

    # list.sort is stable, so files sharing a rank keep their input order.
    ranked.sort(key=lambda pair: pair[0])
    return [entry for _, entry in ranked] + leftovers

477 

478 

479def _check_pandas(found_pandas): 

480 if not found_pandas: 

481 raise ImportError("You must install `pandas` to use this function") 

482 

483 

def ccframe(ccobjs, *args, **kwargs):
    """Collect the attributes of several jobs into one pandas.DataFrame.

    Each object contributes one row holding its parsed data attributes plus
    a 'jobfilename' entry, which is None for objects that are already
    parsed data.

    Inputs:
        ccobjs - an iterable of either cclib jobs (from ccopen) or data (from
        job.parse()) objects

    Returns:
        a pandas.DataFrame
    """
    _check_pandas(_has_pandas)

    rows = []
    for ccobj in ccobjs:
        # Unparsed jobs are parsed here; parsed data is used as it is.
        if isinstance(ccobj, logfileparser.Logfile):
            jobfilename, ccdata = ccobj.filename, ccobj.parse()
        elif isinstance(ccobj, data.ccData):
            jobfilename, ccdata = None, ccobj
        else:
            raise ValueError

        attributes = ccdata.getattributes()
        attributes.update(jobfilename=jobfilename)
        rows.append(pd.Series(attributes))

    return pd.DataFrame(rows)

515 

516 

# find_package is only used for the optional-dependency checks near the top
# of this module, so remove it from the module namespace again.
del find_package