Assemble CRACMM species metadata from CMAQ


author: Havala O.T. Pye (pye.havala@epa.gov)
date: created 2022-09-12

updated: Nash Skipper
date: 2024-02-09

updated: Nash Skipper
date: 2024-03-28

updated: Havala Pye
date: 2025-02-18

updated: Michael Pye
date: 2025-02-27 

Notebook Description

This notebook collects data across the CMAQ model to create a table of species information in both csv and markdown formats. The csv version contains additional data not easily displayed in markdown. Output from this notebook is stored here . After clicking the link, select the directory that corresponds to the chemical mechanism of your choice to find the correct output files.

Download Notebook

Click here to download this tutorial as a Jupyter Notebook file.

CMAQ input files

  • AE_{mech}.nml

  • GC_{mech}.nml

  • NR_{mech}.nml

  • AERO_DATA.F

  • SOA_DEFN.F

  • hlconst.F

  • {mech}_speciesdescription.csv

Mechanisms supported

  • cracmm1_aq

  • cracmm1amore_aq

  • cracmm2

Setup libraries, paths, and function to prepare metadata

import os
import pandas as pd
import re
# Set current working directory where this file resides 
# This code expects *.nml and files from CMAQ to be present in ./input and will output in ./output
outputfiledir = os.path.join(os.getcwd(), 'output')
workdir = '/work/MOD3DEV/has/2023cracmm_ages/structurecuration/'
filepath = os.path.normpath(workdir)
os.chdir(filepath)
inputfiledir = os.path.join(os.getcwd(), 'input')
if not os.path.isdir('./output'):
    os.mkdir('./output')
# Function to prepare dataframe with metadata
def prep_metadata(mech):
    
    # uses CMAQ files: AE.nml, GC.nml, NR.nml. AERO_DATA.F, SOA_DEFN.F, hlconst.F, {mech}_speciesdescription.csv
    # mech input string should be cracmm1_aq or cracmm1amore_aq for CMAQv5.4
    #                             cracmm1_aq, cracmm1amore_aq, or cracmm2 for CMAQv5.5

    ###########################################
    # Prep Gases
    gcfile = 'GC_'+mech+'.nml'
    filename = os.path.join( inputfiledir, gcfile)
    dfgc = pd.read_csv(filename,skiprows=4)
    nrowdim=len(dfgc)
    dfgc=dfgc.drop([nrowdim-1]) #drop last row
    dfgc.columns=dfgc.columns.str.replace(' ','', regex=False)
    dfgc.rename(columns={"!SPECIES":"Species"}, inplace=True)
    dfgc['Species']=dfgc.Species.str.replace("'","", regex=False)
    dfgc['Species']=dfgc.Species.str.replace(" ","", regex=False)
    dfgc['PhaseG']='G'  # is it in the gas-phase? 
    dfgc=dfgc.drop(['GC2AESURR','GC2AQSURR','IC','IC_FAC','BC','BC_FAC','FAC','CONC','WDEP','DDEP'],axis=1) 
    dfgc['Species']=dfgc.Species.str.replace('VROC','ROC', regex=False) # drop for matching with AE

    ###########################################
    # Prep NR
    nrfile = 'NR_'+mech+'.nml'
    filename = os.path.join( inputfiledir, nrfile)
    dfnr = pd.read_csv(filename,skiprows=4)
    nrowdim=len(dfnr)
    dfnr=dfnr.drop([nrowdim-1]) #drop last row
    dfnr.columns=dfnr.columns.str.replace(' ','', regex=False)
    dfnr.rename(columns={"!SPECIES":"Species"}, inplace=True)
    dfnr['Species']=dfnr.Species.str.replace("'","", regex=False)
    dfnr['Species']=dfnr.Species.str.replace(" ","", regex=False)
    dfnr['PhaseG']='G'
    dfnr=dfnr.drop(['NR2AESURR','NR2AQSURR','IC','IC_FAC','BC','BC_FAC','FAC','CONC','WDEP','DDEP'],axis=1)  #these won't match other nml
    # Append NR to GC
    dfgc=pd.concat([dfgc, dfnr],ignore_index=True)
    dfgc['WET-SCAVSURR']=dfgc['WET-SCAVSURR'].str.replace("'","", regex=False)
    dfgc['WET-SCAVSURR']=dfgc['WET-SCAVSURR'].str.replace(" ","", regex=False)

    ###########################################
    #https://www.dataquest.io/wp-content/uploads/2019/03/python-regular-expressions-cheat-sheet.pdf
    # Prep hlconst, dissolution enthalpy for WET-SCAVSURR
    hlfile = 'hlconst.F'
    filename = os.path.join( inputfiledir, hlfile)
    column_names = ['hspecies','henryMatm','henryenthalpyK']
    dfhenry = pd.DataFrame(columns=column_names)
    # read lines that start with DATA SUBNAME ('^       DATA SUBNAME') and parse Hlconst, save to dataframe
    filetoread = open(filename)
    for line in filetoread:
        line = line.rstrip()
        if re.search('^      DATA SUBNAME\(',line):
            hspecies=(re.findall('\)\s*/\s*\'(.*)\'.*!', line)[0]) # return name
            hlvalue=float(re.findall('DATA SUBNAME.*\/.*,(.*),.*\/.*!',line)[0]) # get the item between the first 2 commas between the slashes
            enthalpyK=float(re.findall('DATA SUBNAME.*\/.*,.*,(.*).*\/.*!',line)[0])
            newrow = pd.Series(data={'hspecies':hspecies,'henryMatm':hlvalue,
                                  'henryenthalpyK':enthalpyK})
            dfhenry = pd.concat([dfhenry, newrow.to_frame().T],ignore_index=True)
    dfhenry.hspecies=dfhenry.hspecies.str.replace(" ","", regex=False)
    dfgc=pd.merge(dfgc,dfhenry,left_on="WET-SCAVSURR",right_on="hspecies",how="left")

    ###########################################
    # Prep AE
    aefile = 'AE_'+mech+'.nml'
    filename = os.path.join( inputfiledir, aefile )
    dfae = pd.read_csv(filename,skiprows=4)
    nrowdim=len(dfae)
    dfae=dfae.drop([nrowdim-1]) #drop last row
    dfae.columns=dfae.columns.str.replace(' ','', regex=False) # get rid of spaces in column names
    dfae.rename(columns={"!SPECIES":"Species"}, inplace=True) # rename this heading
    dfae['Species']=dfae.Species.str.replace("'","", regex=False)  # get rid of ' in species names
    dfae['Species']=dfae.Species.str.replace(" ","", regex=False)  # get rid of spaces in species names
    dfae['PhaseP']='P' # particle phase
    dfae=dfae.drop(['AE2AQSURR','FAC.1','IC','IC_FAC','BC','BC_FAC','FAC','CONC','WDEP','DDEP','OPTICS','DRYDEPSURR','WET-SCAVSURR'],axis=1)  #these won't match other nml

    ###########################################
    # Prep AERO_DATA and get density, kappa
    adfile = 'AERO_DATA.F'
    filename = os.path.join( inputfiledir, adfile)
    column_names = ['adspecies','aerodensity','aerokappa']
    dfad = pd.DataFrame(columns=column_names)
    # read lines that start with DATA SUBNAME ('^       DATA SUBNAME') and parse Hlconst, save to dataframe
    filetoread = open(filename)
    for line in filetoread:
        line = line.rstrip()
        if re.search('^     & spcs_list_type\(',line):
            # one comment has () which is problematic, drop
            line=str.replace(line, '(Black)','Black')
            adspecies=(re.findall('^     & spcs_list_type\(\'(.*)\',.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*\)', line)[0]) # return name
            aerodensity=float(re.findall('^     & spcs_list_type\(.*,.*,.*,.*,.*,\s*(.*),.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*\)', line)[0]) 
            aerokappa=float(re.findall('^     & spcs_list_type\(.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,(.*)\)', line)[0]) 
            newrow = pd.Series(data={'adspecies':adspecies,'aerodensity':aerodensity,
                                  'aerokappa':aerokappa})
            dfad = pd.concat([dfad, newrow.to_frame().T],ignore_index=True)
    dfad.adspecies=dfad.adspecies.str.replace(" ","", regex=False)
    dfae=pd.merge(dfae,dfad,left_on="Species",right_on="adspecies",how="left")

    ###########################################
    # Prep SOA_DEFN
    oafile = 'SOA_DEFN.F'
    filename = os.path.join( inputfiledir, oafile)
    column_names = ['oaspecies','oacstar','oaenthalpy','oaotoc','oaomoc']
    dfoa = pd.DataFrame(columns=column_names)
    # read lines that start with DATA SUBNAME ('^       DATA SUBNAME') and parse Hlconst, save to dataframe
    filetoread = open(filename)
    for line in filetoread:
        line = line.rstrip()
        if re.search('^     & oa_type\(',line):
            oaspecies=(      re.findall('^     & oa_type\(\'(.*)\',.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*\)', line)[0]) # return name
            oacstar=float(   re.findall('^     & oa_type\(.*,.*,.*,.*,\s*(.*),.*,.*,.*,.*,.*,.*,.*\)', line)[0]) 
            oaenthalpy=float(re.findall('^     & oa_type\(.*,.*,.*,.*,.*,\s*(.*),.*,.*,.*,.*,.*,.*\)', line)[0]) 
            oaotoc=float(    re.findall('^     & oa_type\(.*,.*,.*,.*,.*,.*,\s*(.*),.*,.*,.*,.*,.*\)', line)[0]) 
            oaomoc=float(  re.findall('^     & oa_type\(.*,.*,.*,.*,.*,.*,.*,\s*(.*),.*,.*,.*,.*\)', line)[0]) 
            newrow = pd.Series(data={'oaspecies':oaspecies,'oacstar':oacstar,
                                  'oaenthalpy':oaenthalpy,'oaotoc':oaotoc,
                                   'oaomoc':oaomoc})
            #print(newrow)
            dfoa = pd.concat([dfoa, newrow.to_frame().T],ignore_index=True)
    dfoa.oaspecies=dfoa.oaspecies.str.replace(" ","", regex=False)
    dfae=pd.merge(dfae,dfoa,left_on="Species",right_on="oaspecies",how="left")

    # Finish formatting ae.nml info
    #dfae['Species']=dfae['Species'].str.strip().str[0:-1] # remove trailing I,J,K, needed for CMAQ v5.3 but not v5.4
    dfae['Species']=dfae.Species.str.replace('AROC','ROC', regex=False) # match these with gas
    dfae['Species']=dfae.Species.str.replace('AHOM','HOM', regex=False)  # match with gas
    dfae['Species']=dfae.Species.str.replace('AELHOM','ELHOM', regex=False) # match with gas
    dfae['Species']=dfae.Species.str.replace('AOP3','OP3', regex=False) # match with gas
    dfae['Species']=dfae.Species.str.replace('ATRPN','TRPN', regex=False) # match with gas
    dfae['Species']=dfae.Species.str.replace('AHONIT','HONIT', regex=False) # match with gas

    ###########################################
    # merge and add g (gas) or p (particle) suffix and do molec wt check
    dfgc=pd.merge(dfgc,dfae,on="Species",how="outer",suffixes=("_g","_p"))
    dfgc['chckmw']=dfgc['MOLWT_g']-dfgc['MOLWT_p'] # gas and particle molecular weights should match
    if len(dfgc[dfgc['chckmw']>0])>0:
        print(">>gas and particle molecular weights have an inconsistency<<")
        print(dfgc[dfgc['chckmw']>0])
    else:
        print(">>gas and particle molecular weights match<<")

    ###########################################
    # bring in descriptions
    filename = os.path.join( inputfiledir, mech+'_speciesdescription.csv')
    dfdesc = pd.read_csv(filename)
    dfdesc.columns=dfdesc.columns.str.replace(' ','', regex=False)
    dfdesc['Species']=dfdesc.Species.str.replace(' ','', regex=False)
    # need to remove spaces from species names
    dfgc= pd.merge(dfgc,dfdesc,left_on='Species',right_on='Species',how="left")
    # warning if no matching species description
    if dfgc[dfgc['Description'].isna()].size>0:
        for spc in dfgc[dfgc['Description'].isna()]['Species']:
            print(f'Warning: {spc} species description is missing')
        print(f'Check {mech}_speciesdescription.csv for missing species descriptions')

    ###########################################
    # Organize data sort alphabetical, take GC.nml value first
    dfgc = dfgc.sort_values("Species") # sort alphabetical
    dfgc["Phase"]=dfgc["PhaseG"].fillna('')+dfgc["PhaseP"].fillna('')
    dfgc['Molecular Weight (g/mol)']=dfgc['MOLWT_g'].fillna(dfgc['MOLWT_p'])
    dfgc['Explicit/Lumped']=dfgc['ExplicitorLumped_g'].fillna(dfgc['ExplicitorLumped_p'])
    dfgc['Representative']=dfgc['!RepCmp_g'].fillna(dfgc['!RepCmp_p']) 
    dfgc['Representative']=dfgc.Representative.str.replace("!","", regex=False)
    dfgc['DTXSID']=dfgc['DTXSID_g'].fillna(dfgc['DTXSID_p'])
    dfgc['DTXSID']=dfgc['DTXSID'].fillna('') 
    dfgc['DTXSID']=dfgc['DTXSID'].str.replace(' ','', regex=False) 
    dfgc['SMILES']=dfgc['SMILES_g'].fillna(dfgc['SMILES_p']) 
    dfgc['SMILES']=dfgc['SMILES'].str.replace(' ','', regex=False) 
    # Diagnose stable species based on them being transported in gas or aerosol
    dfgc['St']=dfgc['TRNS_g'].fillna('')+dfgc['TRNS_p'].fillna('')
    dfgc['St']=dfgc['St'].str.find('Yes')
    dfgc.loc[dfgc['St']>0,'Stable']='Yes'
    dfgc.loc[dfgc['St']<0,'Stable']='No'

    return dfgc

Prepare metadata for mechanism

Warnings will print if molecular weights differ across gas and particle phases. Paired gas-particle species are identified by a prepended A and V.

mech='cracmm2'
dfgc=prep_metadata(mech)
>>gas and particle molecular weights match<<

Save to Markdown File

###########################################
# Write out Markdown for CMAQ GitHub
###########################################
dfmarkdown = dfgc[['Species','Description','Phase','Molecular Weight (g/mol)','Explicit/Lumped','Representative','DTXSID','SMILES']].copy()
dfmarkdown['SMILES']=dfmarkdown['SMILES'].str.replace('NA','', regex=False)
dfmarkdown['SMILES']=dfmarkdown['SMILES'].str.replace('[','\[', regex=False)
dfmarkdown['SMILES']=dfmarkdown['SMILES'].str.replace(']','\]', regex=False)
dfmarkdown['SMILES']=dfmarkdown['SMILES'].str.replace('(','\(', regex=False)
dfmarkdown['SMILES']=dfmarkdown['SMILES'].str.replace(')','\)', regex=False)
dfmarkdown['SMILES']=dfmarkdown['SMILES'].fillna('')

# Hyperlink SMILES to DTXSID entry in dashboard
dfmarkdown['SMILESfmt']= '['+ dfmarkdown.SMILES + '](https://comptox.epa.gov/dashboard/chemical/details/'+ dfmarkdown.DTXSID + ')' 
maskval = dfmarkdown.DTXSID.str.len()>5
dfmarkdown.loc[maskval,'SMILES']=dfmarkdown.loc[maskval,'SMILESfmt']
dfmarkdown=dfmarkdown.drop(['DTXSID'],axis=1)
dfmarkdown=dfmarkdown.drop(['SMILESfmt'],axis=1)

# assemble and format table header
headerline = ' <sub>Species</sub> | <sub>Description</sub> | <sub>Phase</sub> | <sub>Molecular Weight (g/mol)</sub> | <sub>Explicit/ Lumped</sub> | <sub>Representative Structure</sub> | <sub>SMILES</sub> '
firstmarkdownline = "Gas (G) and particle (P) species from the namelists. SMILES link to representative structures in the EPA Chemicals Dashboard (if available)."
secondmarkdownline = "Note that for each particulate species in CMAQ, a letter will be appended to the name to designate the size, or mode, of the aerosol being represented: I = Aitken mode, J = Accumulation mode, K = Coarse mode. Prepending of a species with a V or A in CMAQ or the chemical mechanism files indicates the species resides in the gas or particulate phase. "
dfmarkdown['Representative']=dfgc.Representative.str.replace(";",",", regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace(';',',', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('ug/m3','&#956;g m<sup>-3</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('log10C','log<sub>10</sub>C', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('kOH','k<sub>OH</sub>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('cm3','cm<sup>3</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('s-1','s<sup>-1</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-10','10<sup>-10</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-11','10<sup>-11</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-12','10<sup>-12</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-13','10<sup>-13</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-14','10<sup>-14</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-2','10<sup>-2</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-1','10<sup>-1</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10+1','10<sup>+1</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10+2','10<sup>+2</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10+3','10<sup>+3</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10+4','10<sup>+4</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10+5','10<sup>+5</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10+6','10<sup>+6</sup>', regex=False)

mdfile = mech+'_species_table.md'
filename = os.path.join( outputfiledir, mdfile)
mdfile= open(filename,'w')
mdfile.write(mech.upper() + ' Species Table')
mdfile.write('\n')
mdfile.write(firstmarkdownline)
mdfile.write('\n')
mdfile.write('\n')
mdfile.write(secondmarkdownline)
mdfile.write('\n')
mdfile.write('\n')
mdfile.write(headerline)
mdfile.write('\n')
mdfile.write(' ----- | ----- | ----- | ----- | ----- | ----- | ----- ')
mdfile.write('\n')
mdfile.close()
dfmarkdown.to_csv(filename,index=False,header=False,sep='|',mode='a')
mdfile= open(filename,'a')
mdfile.write('\n')
mdfile.close()

Save to csv file

############# Metadata CSV
dfmetadata=dfgc[['Species','Description','Phase','Stable','Molecular Weight (g/mol)',
                 'Explicit/Lumped','Representative','SMILES','DTXSID','henryMatm',
                 'henryenthalpyK','aerodensity','aerokappa','oacstar','oaenthalpy',
                 'oaomoc']].copy()
dfmetadata['aerokappa']=dfmetadata.aerokappa.mask(dfmetadata.aerokappa <= 0, 'NA')
dfmetadata['DTXSID']=dfmetadata.DTXSID.mask(dfmetadata.DTXSID == '', 'NA' )
dfmetadata=dfmetadata.rename(columns={'henryMatm':'H Law (M/atm)'})
dfmetadata=dfmetadata.rename(columns={'henryenthalpyK':'Enthalpy of solution (K)'})
dfmetadata=dfmetadata.rename(columns={'aerodensity':'Aerosol density (kg/m3)'})
dfmetadata=dfmetadata.rename(columns={'aerokappa':'Kappa_org'})
dfmetadata=dfmetadata.rename(columns={'oacstar':'C* (microg/m3)'})
dfmetadata=dfmetadata.rename(columns={'oaenthalpy':'Enthalpy of vaporization (J/mol)'})
dfmetadata=dfmetadata.rename(columns={'oaomoc':'OM to OC (g/g)'})

dfmetadata=dfmetadata.fillna('NA')

metafile = mech+'_metadata.csv'
filename = os.path.join( outputfiledir, metafile)
dfmetadata.to_csv(filename,index=False,header=True,sep=',',mode='w')