Assemble CRACMM species metadata from CMAQ

author: Havala O.T. Pye (pye.havala@epa.gov)
date: created 2022-09-12

updated: Nash Skipper
date: 2024-02-09

updated: Nash Skipper
date: 2024-03-28

updated: Havala Pye
date: 2025-02-18

updated: Michael Pye
date: 2025-02-27 

Notebook Description

This notebook collects data across the CMAQ model to create a table of species information in both csv and markdown formats. The csv version contains additional data not easily displayed in markdown. Output from this notebook is stored here . After clicking the link, select the directory that corresponds to the chemical mechanism of your choice to find the correct output files.

Download Notebook

Click here to download this tutorial as a Jupyter Notebook file.

CMAQ input files

AE_{mech}.nml
GC_{mech}.nml
NR_{mech}.nml
AERO_DATA.F
SOA_DEFN.F
hlconst.F
{mech}_speciesdescription.csv

Mechanisms supported

cracmm1_aq
cracmm1amore_aq
cracmm2

Setup libraries, paths, and function to prepare metadata

import os
import pandas as pd
import re

# Set current working directory where this file resides 
# This code expects *.nml and files from CMAQ to be present in ./input and will output in ./output
outputfiledir = os.path.join(os.getcwd(), 'output')
workdir = '/work/MOD3DEV/has/2023cracmm_ages/structurecuration/'
filepath = os.path.normpath(workdir)
os.chdir(filepath)
inputfiledir = os.path.join(os.getcwd(), 'input')
if not os.path.isdir('./output'):
    os.mkdir('./output')

# Function to prepare dataframe with metadata
def prep_metadata(mech):
    
    # uses CMAQ files: AE.nml, GC.nml, NR.nml. AERO_DATA.F, SOA_DEFN.F, hlconst.F, {mech}_speciesdescription.csv
    # mech input string should be cracmm1_aq or cracmm1amore_aq for CMAQv5.4
    #                             cracmm1_aq, cracmm1amore_aq, or cracmm2 for CMAQv5.5

    ###########################################
    # Prep Gases
    gcfile = 'GC_'+mech+'.nml'
    filename = os.path.join( inputfiledir, gcfile)
    dfgc = pd.read_csv(filename,skiprows=4)
    nrowdim=len(dfgc)
    dfgc=dfgc.drop([nrowdim-1]) #drop last row
    dfgc.columns=dfgc.columns.str.replace(' ','', regex=False)
    dfgc.rename(columns={"!SPECIES":"Species"}, inplace=True)
    dfgc['Species']=dfgc.Species.str.replace("'","", regex=False)
    dfgc['Species']=dfgc.Species.str.replace(" ","", regex=False)
    dfgc['PhaseG']='G'  # is it in the gas-phase? 
    dfgc=dfgc.drop(['GC2AESURR','GC2AQSURR','IC','IC_FAC','BC','BC_FAC','FAC','CONC','WDEP','DDEP'],axis=1) 
    dfgc['Species']=dfgc.Species.str.replace('VROC','ROC', regex=False) # drop for matching with AE

    ###########################################
    # Prep NR
    nrfile = 'NR_'+mech+'.nml'
    filename = os.path.join( inputfiledir, nrfile)
    dfnr = pd.read_csv(filename,skiprows=4)
    nrowdim=len(dfnr)
    dfnr=dfnr.drop([nrowdim-1]) #drop last row
    dfnr.columns=dfnr.columns.str.replace(' ','', regex=False)
    dfnr.rename(columns={"!SPECIES":"Species"}, inplace=True)
    dfnr['Species']=dfnr.Species.str.replace("'","", regex=False)
    dfnr['Species']=dfnr.Species.str.replace(" ","", regex=False)
    dfnr['PhaseG']='G'
    dfnr=dfnr.drop(['NR2AESURR','NR2AQSURR','IC','IC_FAC','BC','BC_FAC','FAC','CONC','WDEP','DDEP'],axis=1)  #these won't match other nml
    # Append NR to GC
    dfgc=pd.concat([dfgc, dfnr],ignore_index=True)
    dfgc['WET-SCAVSURR']=dfgc['WET-SCAVSURR'].str.replace("'","", regex=False)
    dfgc['WET-SCAVSURR']=dfgc['WET-SCAVSURR'].str.replace(" ","", regex=False)

    ###########################################
    #https://www.dataquest.io/wp-content/uploads/2019/03/python-regular-expressions-cheat-sheet.pdf
    # Prep hlconst, dissolution enthalpy for WET-SCAVSURR
    hlfile = 'hlconst.F'
    filename = os.path.join( inputfiledir, hlfile)
    column_names = ['hspecies','henryMatm','henryenthalpyK']
    dfhenry = pd.DataFrame(columns=column_names)
    # read lines that start with DATA SUBNAME ('^       DATA SUBNAME') and parse Hlconst, save to dataframe
    filetoread = open(filename)
    for line in filetoread:
        line = line.rstrip()
        if re.search('^      DATA SUBNAME\(',line):
            hspecies=(re.findall('\)\s*/\s*\'(.*)\'.*!', line)[0]) # return name
            hlvalue=float(re.findall('DATA SUBNAME.*\/.*,(.*),.*\/.*!',line)[0]) # get the item between the first 2 commas between the slashes
            enthalpyK=float(re.findall('DATA SUBNAME.*\/.*,.*,(.*).*\/.*!',line)[0])
            newrow = pd.Series(data={'hspecies':hspecies,'henryMatm':hlvalue,
                                  'henryenthalpyK':enthalpyK})
            dfhenry = pd.concat([dfhenry, newrow.to_frame().T],ignore_index=True)
    dfhenry.hspecies=dfhenry.hspecies.str.replace(" ","", regex=False)
    dfgc=pd.merge(dfgc,dfhenry,left_on="WET-SCAVSURR",right_on="hspecies",how="left")

    ###########################################
    # Prep AE
    aefile = 'AE_'+mech+'.nml'
    filename = os.path.join( inputfiledir, aefile )
    dfae = pd.read_csv(filename,skiprows=4)
    nrowdim=len(dfae)
    dfae=dfae.drop([nrowdim-1]) #drop last row
    dfae.columns=dfae.columns.str.replace(' ','', regex=False) # get rid of spaces in column names
    dfae.rename(columns={"!SPECIES":"Species"}, inplace=True) # rename this heading
    dfae['Species']=dfae.Species.str.replace("'","", regex=False)  # get rid of ' in species names
    dfae['Species']=dfae.Species.str.replace(" ","", regex=False)  # get rid of spaces in species names
    dfae['PhaseP']='P' # particle phase
    dfae=dfae.drop(['AE2AQSURR','FAC.1','IC','IC_FAC','BC','BC_FAC','FAC','CONC','WDEP','DDEP','OPTICS','DRYDEPSURR','WET-SCAVSURR'],axis=1)  #these won't match other nml

    ###########################################
    # Prep AERO_DATA and get density, kappa
    adfile = 'AERO_DATA.F'
    filename = os.path.join( inputfiledir, adfile)
    column_names = ['adspecies','aerodensity','aerokappa']
    dfad = pd.DataFrame(columns=column_names)
    # read lines that start with DATA SUBNAME ('^       DATA SUBNAME') and parse Hlconst, save to dataframe
    filetoread = open(filename)
    for line in filetoread:
        line = line.rstrip()
        if re.search('^     & spcs_list_type\(',line):
            # one comment has () which is problematic, drop
            line=str.replace(line, '(Black)','Black')
            adspecies=(re.findall('^     & spcs_list_type\(\'(.*)\',.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*\)', line)[0]) # return name
            aerodensity=float(re.findall('^     & spcs_list_type\(.*,.*,.*,.*,.*,\s*(.*),.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*\)', line)[0]) 
            aerokappa=float(re.findall('^     & spcs_list_type\(.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,(.*)\)', line)[0]) 
            newrow = pd.Series(data={'adspecies':adspecies,'aerodensity':aerodensity,
                                  'aerokappa':aerokappa})
            dfad = pd.concat([dfad, newrow.to_frame().T],ignore_index=True)
    dfad.adspecies=dfad.adspecies.str.replace(" ","", regex=False)
    dfae=pd.merge(dfae,dfad,left_on="Species",right_on="adspecies",how="left")

    ###########################################
    # Prep SOA_DEFN
    oafile = 'SOA_DEFN.F'
    filename = os.path.join( inputfiledir, oafile)
    column_names = ['oaspecies','oacstar','oaenthalpy','oaotoc','oaomoc']
    dfoa = pd.DataFrame(columns=column_names)
    # read lines that start with DATA SUBNAME ('^       DATA SUBNAME') and parse Hlconst, save to dataframe
    filetoread = open(filename)
    for line in filetoread:
        line = line.rstrip()
        if re.search('^     & oa_type\(',line):
            oaspecies=(      re.findall('^     & oa_type\(\'(.*)\',.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*\)', line)[0]) # return name
            oacstar=float(   re.findall('^     & oa_type\(.*,.*,.*,.*,\s*(.*),.*,.*,.*,.*,.*,.*,.*\)', line)[0]) 
            oaenthalpy=float(re.findall('^     & oa_type\(.*,.*,.*,.*,.*,\s*(.*),.*,.*,.*,.*,.*,.*\)', line)[0]) 
            oaotoc=float(    re.findall('^     & oa_type\(.*,.*,.*,.*,.*,.*,\s*(.*),.*,.*,.*,.*,.*\)', line)[0]) 
            oaomoc=float(  re.findall('^     & oa_type\(.*,.*,.*,.*,.*,.*,.*,\s*(.*),.*,.*,.*,.*\)', line)[0]) 
            newrow = pd.Series(data={'oaspecies':oaspecies,'oacstar':oacstar,
                                  'oaenthalpy':oaenthalpy,'oaotoc':oaotoc,
                                   'oaomoc':oaomoc})
            #print(newrow)
            dfoa = pd.concat([dfoa, newrow.to_frame().T],ignore_index=True)
    dfoa.oaspecies=dfoa.oaspecies.str.replace(" ","", regex=False)
    dfae=pd.merge(dfae,dfoa,left_on="Species",right_on="oaspecies",how="left")

    # Finish formatting ae.nml info
    #dfae['Species']=dfae['Species'].str.strip().str[0:-1] # remove trailing I,J,K, needed for CMAQ v5.3 but not v5.4
    dfae['Species']=dfae.Species.str.replace('AROC','ROC', regex=False) # match these with gas
    dfae['Species']=dfae.Species.str.replace('AHOM','HOM', regex=False)  # match with gas
    dfae['Species']=dfae.Species.str.replace('AELHOM','ELHOM', regex=False) # match with gas
    dfae['Species']=dfae.Species.str.replace('AOP3','OP3', regex=False) # match with gas
    dfae['Species']=dfae.Species.str.replace('ATRPN','TRPN', regex=False) # match with gas
    dfae['Species']=dfae.Species.str.replace('AHONIT','HONIT', regex=False) # match with gas

    ###########################################
    # merge and add g (gas) or p (particle) suffix and do molec wt check
    dfgc=pd.merge(dfgc,dfae,on="Species",how="outer",suffixes=("_g","_p"))
    dfgc['chckmw']=dfgc['MOLWT_g']-dfgc['MOLWT_p'] # gas and particle molecular weights should match
    if len(dfgc[dfgc['chckmw']>0])>0:
        print(">>gas and particle molecular weights have an inconsistency<<")
        print(dfgc[dfgc['chckmw']>0])
    else:
        print(">>gas and particle molecular weights match<<")

    ###########################################
    # bring in descriptions
    filename = os.path.join( inputfiledir, mech+'_speciesdescription.csv')
    dfdesc = pd.read_csv(filename)
    dfdesc.columns=dfdesc.columns.str.replace(' ','', regex=False)
    dfdesc['Species']=dfdesc.Species.str.replace(' ','', regex=False)
    # need to remove spaces from species names
    dfgc= pd.merge(dfgc,dfdesc,left_on='Species',right_on='Species',how="left")
    # warning if no matching species description
    if dfgc[dfgc['Description'].isna()].size>0:
        for spc in dfgc[dfgc['Description'].isna()]['Species']:
            print(f'Warning: {spc} species description is missing')
        print(f'Check {mech}_speciesdescription.csv for missing species descriptions')

    ###########################################
    # Organize data sort alphabetical, take GC.nml value first
    dfgc = dfgc.sort_values("Species") # sort alphabetical
    dfgc["Phase"]=dfgc["PhaseG"].fillna('')+dfgc["PhaseP"].fillna('')
    dfgc['Molecular Weight (g/mol)']=dfgc['MOLWT_g'].fillna(dfgc['MOLWT_p'])
    dfgc['Explicit/Lumped']=dfgc['ExplicitorLumped_g'].fillna(dfgc['ExplicitorLumped_p'])
    dfgc['Representative']=dfgc['!RepCmp_g'].fillna(dfgc['!RepCmp_p']) 
    dfgc['Representative']=dfgc.Representative.str.replace("!","", regex=False)
    dfgc['DTXSID']=dfgc['DTXSID_g'].fillna(dfgc['DTXSID_p'])
    dfgc['DTXSID']=dfgc['DTXSID'].fillna('') 
    dfgc['DTXSID']=dfgc['DTXSID'].str.replace(' ','', regex=False) 
    dfgc['SMILES']=dfgc['SMILES_g'].fillna(dfgc['SMILES_p']) 
    dfgc['SMILES']=dfgc['SMILES'].str.replace(' ','', regex=False) 
    # Diagnose stable species based on them being transported in gas or aerosol
    dfgc['St']=dfgc['TRNS_g'].fillna('')+dfgc['TRNS_p'].fillna('')
    dfgc['St']=dfgc['St'].str.find('Yes')
    dfgc.loc[dfgc['St']>0,'Stable']='Yes'
    dfgc.loc[dfgc['St']<0,'Stable']='No'

    return dfgc

Prepare metadata for mechanism

Warnings will print if molecular weights differ across gas and particle phases. Paired gas-particle species are identified by a prepended A and V.

mech='cracmm2'
dfgc=prep_metadata(mech)

>>gas and particle molecular weights match<<

Save to Markdown File

###########################################
# Write out Markdown for CMAQ GitHub
###########################################
dfmarkdown = dfgc[['Species','Description','Phase','Molecular Weight (g/mol)','Explicit/Lumped','Representative','DTXSID','SMILES']].copy()
dfmarkdown['SMILES']=dfmarkdown['SMILES'].str.replace('NA','', regex=False)
dfmarkdown['SMILES']=dfmarkdown['SMILES'].str.replace('[','\[', regex=False)
dfmarkdown['SMILES']=dfmarkdown['SMILES'].str.replace(']','\]', regex=False)
dfmarkdown['SMILES']=dfmarkdown['SMILES'].str.replace('(','\(', regex=False)
dfmarkdown['SMILES']=dfmarkdown['SMILES'].str.replace(')','\)', regex=False)
dfmarkdown['SMILES']=dfmarkdown['SMILES'].fillna('')

# Hyperlink SMILES to DTXSID entry in dashboard
dfmarkdown['SMILESfmt']= '['+ dfmarkdown.SMILES + '](https://comptox.epa.gov/dashboard/chemical/details/'+ dfmarkdown.DTXSID + ')' 
maskval = dfmarkdown.DTXSID.str.len()>5
dfmarkdown.loc[maskval,'SMILES']=dfmarkdown.loc[maskval,'SMILESfmt']
dfmarkdown=dfmarkdown.drop(['DTXSID'],axis=1)
dfmarkdown=dfmarkdown.drop(['SMILESfmt'],axis=1)

# assemble and format table header
headerline = ' <sub>Species</sub> | <sub>Description</sub> | <sub>Phase</sub> | <sub>Molecular Weight (g/mol)</sub> | <sub>Explicit/ Lumped</sub> | <sub>Representative Structure</sub> | <sub>SMILES</sub> '
firstmarkdownline = "Gas (G) and particle (P) species from the namelists. SMILES link to representative structures in the EPA Chemicals Dashboard (if available)."
secondmarkdownline = "Note that for each particulate species in CMAQ, a letter will be appended to the name to designate the size, or mode, of the aerosol being represented: I = Aitken mode, J = Accumulation mode, K = Coarse mode. Prepending of a species with a V or A in CMAQ or the chemical mechanism files indicates the species resides in the gas or particulate phase. "
dfmarkdown['Representative']=dfgc.Representative.str.replace(";",",", regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace(';',',', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('ug/m3','&#956;g m<sup>-3</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('log10C','log<sub>10</sub>C', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('kOH','k<sub>OH</sub>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('cm3','cm<sup>3</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('s-1','s<sup>-1</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-10','10<sup>-10</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-11','10<sup>-11</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-12','10<sup>-12</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-13','10<sup>-13</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-14','10<sup>-14</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-2','10<sup>-2</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10-1','10<sup>-1</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10+1','10<sup>+1</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10+2','10<sup>+2</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10+3','10<sup>+3</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10+4','10<sup>+4</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10+5','10<sup>+5</sup>', regex=False)
dfmarkdown['Description']=dfmarkdown.Description.str.replace('10+6','10<sup>+6</sup>', regex=False)

mdfile = mech+'_species_table.md'
filename = os.path.join( outputfiledir, mdfile)
mdfile= open(filename,'w')
mdfile.write(mech.upper() + ' Species Table')
mdfile.write('\n')
mdfile.write(firstmarkdownline)
mdfile.write('\n')
mdfile.write('\n')
mdfile.write(secondmarkdownline)
mdfile.write('\n')
mdfile.write('\n')
mdfile.write(headerline)
mdfile.write('\n')
mdfile.write(' ----- | ----- | ----- | ----- | ----- | ----- | ----- ')
mdfile.write('\n')
mdfile.close()
dfmarkdown.to_csv(filename,index=False,header=False,sep='|',mode='a')
mdfile= open(filename,'a')
mdfile.write('\n')
mdfile.close()

Save to csv file

############# Metadata CSV
dfmetadata=dfgc[['Species','Description','Phase','Stable','Molecular Weight (g/mol)',
                 'Explicit/Lumped','Representative','SMILES','DTXSID','henryMatm',
                 'henryenthalpyK','aerodensity','aerokappa','oacstar','oaenthalpy',
                 'oaomoc']].copy()
dfmetadata['aerokappa']=dfmetadata.aerokappa.mask(dfmetadata.aerokappa <= 0, 'NA')
dfmetadata['DTXSID']=dfmetadata.DTXSID.mask(dfmetadata.DTXSID == '', 'NA' )
dfmetadata=dfmetadata.rename(columns={'henryMatm':'H Law (M/atm)'})
dfmetadata=dfmetadata.rename(columns={'henryenthalpyK':'Enthalpy of solution (K)'})
dfmetadata=dfmetadata.rename(columns={'aerodensity':'Aerosol density (kg/m3)'})
dfmetadata=dfmetadata.rename(columns={'aerokappa':'Kappa_org'})
dfmetadata=dfmetadata.rename(columns={'oacstar':'C* (microg/m3)'})
dfmetadata=dfmetadata.rename(columns={'oaenthalpy':'Enthalpy of vaporization (J/mol)'})
dfmetadata=dfmetadata.rename(columns={'oaomoc':'OM to OC (g/g)'})

dfmetadata=dfmetadata.fillna('NA')

metafile = mech+'_metadata.csv'
filename = os.path.join( outputfiledir, metafile)
dfmetadata.to_csv(filename,index=False,header=True,sep=',',mode='w')