{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ac3c4f7b-3c6b-4698-9979-501b102b1296",
   "metadata": {},
   "source": [
    "# Assemble CRACMM species metadata from CMAQ \n",
    "---\n",
    "    author: Havala O.T. Pye (pye.havala@epa.gov)\n",
    "    date: created 2022-09-12\n",
    "\n",
    "    updated: Nash Skipper\n",
    "    date: 2024-02-09\n",
    "\n",
    "    updated: Nash Skipper\n",
    "    date: 2024-03-28\n",
    "    \n",
    "    updated: Havala Pye\n",
    "    date: 2025-02-18\n",
    "\n",
    "    updated: Michael Pye\n",
    "    date: 2025-02-27 \n",
    "---\n",
    "## Notebook Description\n",
    "This notebook collects data across the CMAQ model to create a table of species information in both csv and markdown formats. The csv version contains additional data not easily displayed in markdown. Output from this notebook is stored [here](https://github.com/USEPA/CRACMM/tree/main/metadata). After clicking the link, select the directory that corresponds to the chemical mechanism of your choice to find the correct output files. \n",
    "\n",
    "## Download Notebook\n",
    "Click [here](https://github.com/USEPA/CRACMM/blob/main/utilities/markdown_metadata.ipynb) to download this tutorial as a Jupyter Notebook file.  \n",
    "\n",
    "## CMAQ input files\n",
    "- AE_{mech}.nml\n",
    "- GC_{mech}.nml \n",
    "- NR_{mech}.nml \n",
    "- AERO_DATA.F \n",
    "- SOA_DEFN.F \n",
    "- hlconst.F \n",
    "- {mech}_speciesdescription.csv\n",
    "\n",
    "## Mechanisms supported \n",
    "- cracmm1_aq \n",
    "- cracmm1amore_aq \n",
    "- cracmm2 "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e0a836b6-4358-4206-b8e6-d22a01f94479",
   "metadata": {},
   "source": [
    "## Setup libraries, paths, and function to prepare metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d4df66fa-7d2a-4b29-8e72-c3f63ee126a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5026cc63-a3e5-4fe7-8f1d-3825a2161a5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set current working directory where this file resides\n",
    "# This code expects *.nml and other files from CMAQ to be present in ./input and will output in ./output\n",
    "workdir = '/work/MOD3DEV/has/2023cracmm_ages/structurecuration/'\n",
    "filepath = os.path.normpath(workdir)\n",
    "os.chdir(filepath)\n",
    "# Resolve both directories after the chdir so they sit side by side under workdir\n",
    "# and the directory created here is the same one the later cells write to.\n",
    "inputfiledir = os.path.join(os.getcwd(), 'input')\n",
    "outputfiledir = os.path.join(os.getcwd(), 'output')\n",
    "# exist_ok makes the cell safe to re-run\n",
    "os.makedirs(outputfiledir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c81725dc-ee1a-4860-a9ef-33bb8c823140",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Functions to prepare dataframe with metadata\n",
    "def _read_species_namelist(nmlfile):\n",
    "    \"\"\"Read a CMAQ species namelist from inputfiledir into a dataframe.\n",
    "\n",
    "    Skips the first 4 lines, drops the last row, strips spaces from the column\n",
    "    names, renames '!SPECIES' to 'Species', and removes quotes and spaces from\n",
    "    the species names.\n",
    "    \"\"\"\n",
    "    filename = os.path.join(inputfiledir, nmlfile)\n",
    "    dfnml = pd.read_csv(filename, skiprows=4)\n",
    "    dfnml = dfnml.drop([len(dfnml)-1]) # drop last row\n",
    "    dfnml.columns = dfnml.columns.str.replace(' ', '', regex=False) # get rid of spaces in column names\n",
    "    dfnml = dfnml.rename(columns={'!SPECIES': 'Species'}) # rename this heading\n",
    "    dfnml['Species'] = dfnml.Species.str.replace(\"'\", '', regex=False) # get rid of ' in species names\n",
    "    dfnml['Species'] = dfnml.Species.str.replace(' ', '', regex=False) # get rid of spaces in species names\n",
    "    return dfnml\n",
    "\n",
    "\n",
    "def _parse_fortran_lines(fortranfile, linestart, fields, cleanline=None):\n",
    "    \"\"\"Build a dataframe with one row per matching line of a Fortran source file.\n",
    "\n",
    "    fortranfile : name of the file inside inputfiledir\n",
    "    linestart   : regex selecting the lines to parse\n",
    "    fields      : dict of column name -> (regex with one capture group, converter);\n",
    "                  the first match of the regex is passed through the converter\n",
    "    cleanline   : optional function applied to a selected line before it is parsed\n",
    "\n",
    "    Returns an object-dtype dataframe with one column per key of fields.\n",
    "    Raises IndexError if a selected line does not match a field regex and\n",
    "    ValueError if a converter rejects the captured text.\n",
    "    \"\"\"\n",
    "    filename = os.path.join(inputfiledir, fortranfile)\n",
    "    records = []\n",
    "    with open(filename) as filetoread: # closes the file even if parsing fails\n",
    "        for line in filetoread:\n",
    "            line = line.rstrip()\n",
    "            if not re.search(linestart, line):\n",
    "                continue\n",
    "            if cleanline is not None:\n",
    "                line = cleanline(line)\n",
    "            records.append({column: convert(re.findall(pattern, line)[0])\n",
    "                            for column, (pattern, convert) in fields.items()})\n",
    "    # build the dataframe once instead of concatenating row by row;\n",
    "    # object dtype keeps the columns usable with .str and with 'NA' placeholders later\n",
    "    return pd.DataFrame(records, columns=list(fields), dtype=object)\n",
    "\n",
    "\n",
    "def prep_metadata(mech):\n",
    "    \"\"\"Assemble gas and particle species metadata for one chemical mechanism.\n",
    "\n",
    "    Uses CMAQ files in inputfiledir: AE_{mech}.nml, GC_{mech}.nml, NR_{mech}.nml,\n",
    "    AERO_DATA.F, SOA_DEFN.F, hlconst.F, {mech}_speciesdescription.csv\n",
    "\n",
    "    mech : mechanism name string\n",
    "           should be cracmm1_aq or cracmm1amore_aq for CMAQv5.4\n",
    "           cracmm1_aq, cracmm1amore_aq, or cracmm2 for CMAQv5.5\n",
    "\n",
    "    Returns a dataframe sorted by Species holding the merged namelist columns\n",
    "    (gas columns suffixed _g, particle columns suffixed _p), the Henry's law,\n",
    "    AERO_DATA and SOA_DEFN values, the species descriptions, and the derived\n",
    "    columns Phase, Molecular Weight (g/mol), Explicit/Lumped, Representative,\n",
    "    DTXSID, SMILES, St and Stable.\n",
    "\n",
    "    Prints the result of the gas/particle molecular weight check and a warning\n",
    "    for every species without a description.\n",
    "    \"\"\"\n",
    "\n",
    "    ###########################################\n",
    "    # Prep Gases\n",
    "    dfgc = _read_species_namelist('GC_'+mech+'.nml')\n",
    "    dfgc['PhaseG']='G'  # is it in the gas-phase? \n",
    "    dfgc=dfgc.drop(['GC2AESURR','GC2AQSURR','IC','IC_FAC','BC','BC_FAC','FAC','CONC','WDEP','DDEP'],axis=1) \n",
    "    dfgc['Species']=dfgc.Species.str.replace('VROC','ROC', regex=False) # drop for matching with AE\n",
    "\n",
    "    ###########################################\n",
    "    # Prep NR\n",
    "    dfnr = _read_species_namelist('NR_'+mech+'.nml')\n",
    "    dfnr['PhaseG']='G'\n",
    "    dfnr=dfnr.drop(['NR2AESURR','NR2AQSURR','IC','IC_FAC','BC','BC_FAC','FAC','CONC','WDEP','DDEP'],axis=1)  #these won't match other nml\n",
    "    # Append NR to GC\n",
    "    dfgc=pd.concat([dfgc, dfnr],ignore_index=True)\n",
    "    dfgc['WET-SCAVSURR']=dfgc['WET-SCAVSURR'].str.replace(\"'\",\"\", regex=False)\n",
    "    dfgc['WET-SCAVSURR']=dfgc['WET-SCAVSURR'].str.replace(\" \",\"\", regex=False)\n",
    "\n",
    "    ###########################################\n",
    "    #https://www.dataquest.io/wp-content/uploads/2019/03/python-regular-expressions-cheat-sheet.pdf\n",
    "    # Prep hlconst, dissolution enthalpy for WET-SCAVSURR\n",
    "    # read lines that start with DATA SUBNAME and parse the name and the two values after it\n",
    "    dfhenry = _parse_fortran_lines('hlconst.F', r'^      DATA SUBNAME\\(', {\n",
    "        'hspecies':       (r'\\)\\s*/\\s*\\'(.*)\\'.*!', str), # return name\n",
    "        'henryMatm':      (r'DATA SUBNAME.*\\/.*,(.*),.*\\/.*!', float), # get the item between the first 2 commas between the slashes\n",
    "        'henryenthalpyK': (r'DATA SUBNAME.*\\/.*,.*,(.*).*\\/.*!', float),\n",
    "    })\n",
    "    dfhenry.hspecies=dfhenry.hspecies.str.replace(\" \",\"\", regex=False)\n",
    "    dfgc=pd.merge(dfgc,dfhenry,left_on=\"WET-SCAVSURR\",right_on=\"hspecies\",how=\"left\")\n",
    "\n",
    "    ###########################################\n",
    "    # Prep AE\n",
    "    dfae = _read_species_namelist('AE_'+mech+'.nml')\n",
    "    dfae['PhaseP']='P' # particle phase\n",
    "    dfae=dfae.drop(['AE2AQSURR','FAC.1','IC','IC_FAC','BC','BC_FAC','FAC','CONC','WDEP','DDEP','OPTICS','DRYDEPSURR','WET-SCAVSURR'],axis=1)  #these won't match other nml\n",
    "\n",
    "    ###########################################\n",
    "    # Prep AERO_DATA and get density, kappa\n",
    "    # read the spcs_list_type continuation lines and parse name, density and kappa\n",
    "    dfad = _parse_fortran_lines('AERO_DATA.F', r'^     & spcs_list_type\\(', {\n",
    "        'adspecies':   (r'^     & spcs_list_type\\(\\'(.*)\\',.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*\\)', str), # return name\n",
    "        'aerodensity': (r'^     & spcs_list_type\\(.*,.*,.*,.*,.*,\\s*(.*),.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*\\)', float),\n",
    "        'aerokappa':   (r'^     & spcs_list_type\\(.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,(.*)\\)', float),\n",
    "    },\n",
    "        # one comment has () which is problematic, drop\n",
    "        cleanline=lambda line: line.replace('(Black)','Black'))\n",
    "    dfad.adspecies=dfad.adspecies.str.replace(\" \",\"\", regex=False)\n",
    "    dfae=pd.merge(dfae,dfad,left_on=\"Species\",right_on=\"adspecies\",how=\"left\")\n",
    "\n",
    "    ###########################################\n",
    "    # Prep SOA_DEFN\n",
    "    # read the oa_type continuation lines and parse name, C*, enthalpy, O:C and OM:OC\n",
    "    dfoa = _parse_fortran_lines('SOA_DEFN.F', r'^     & oa_type\\(', {\n",
    "        'oaspecies':  (r'^     & oa_type\\(\\'(.*)\\',.*,.*,.*,.*,.*,.*,.*,.*,.*,.*,.*\\)', str), # return name\n",
    "        'oacstar':    (r'^     & oa_type\\(.*,.*,.*,.*,\\s*(.*),.*,.*,.*,.*,.*,.*,.*\\)', float),\n",
    "        'oaenthalpy': (r'^     & oa_type\\(.*,.*,.*,.*,.*,\\s*(.*),.*,.*,.*,.*,.*,.*\\)', float),\n",
    "        'oaotoc':     (r'^     & oa_type\\(.*,.*,.*,.*,.*,.*,\\s*(.*),.*,.*,.*,.*,.*\\)', float),\n",
    "        'oaomoc':     (r'^     & oa_type\\(.*,.*,.*,.*,.*,.*,.*,\\s*(.*),.*,.*,.*,.*\\)', float),\n",
    "    })\n",
    "    dfoa.oaspecies=dfoa.oaspecies.str.replace(\" \",\"\", regex=False)\n",
    "    dfae=pd.merge(dfae,dfoa,left_on=\"Species\",right_on=\"oaspecies\",how=\"left\")\n",
    "\n",
    "    # Finish formatting ae.nml info\n",
    "    #dfae['Species']=dfae['Species'].str.strip().str[0:-1] # remove trailing I,J,K, needed for CMAQ v5.3 but not v5.4\n",
    "    # strip the particle prefix so names match the gas species; order is kept as listed\n",
    "    for aename, gasname in [('AROC','ROC'),('AHOM','HOM'),('AELHOM','ELHOM'),\n",
    "                            ('AOP3','OP3'),('ATRPN','TRPN'),('AHONIT','HONIT')]:\n",
    "        dfae['Species']=dfae.Species.str.replace(aename,gasname, regex=False)\n",
    "\n",
    "    ###########################################\n",
    "    # merge and add g (gas) or p (particle) suffix and do molec wt check\n",
    "    dfgc=pd.merge(dfgc,dfae,on=\"Species\",how=\"outer\",suffixes=(\"_g\",\"_p\"))\n",
    "    dfgc['chckmw']=dfgc['MOLWT_g']-dfgc['MOLWT_p'] # gas and particle molecular weights should match\n",
    "    # flag a difference in either direction; NaN means the species exists in one phase only\n",
    "    mwmismatch = dfgc['chckmw'].notna() & (dfgc['chckmw'] != 0)\n",
    "    if mwmismatch.any():\n",
    "        print(\">>gas and particle molecular weights have an inconsistency<<\")\n",
    "        print(dfgc[mwmismatch])\n",
    "    else:\n",
    "        print(\">>gas and particle molecular weights match<<\")\n",
    "\n",
    "    ###########################################\n",
    "    # bring in descriptions\n",
    "    filename = os.path.join( inputfiledir, mech+'_speciesdescription.csv')\n",
    "    dfdesc = pd.read_csv(filename)\n",
    "    dfdesc.columns=dfdesc.columns.str.replace(' ','', regex=False)\n",
    "    # need to remove spaces from species names\n",
    "    dfdesc['Species']=dfdesc.Species.str.replace(' ','', regex=False)\n",
    "    dfgc= pd.merge(dfgc,dfdesc,left_on='Species',right_on='Species',how=\"left\")\n",
    "    # warning if no matching species description\n",
    "    nodescription = dfgc['Description'].isna()\n",
    "    if nodescription.any():\n",
    "        for spc in dfgc.loc[nodescription,'Species']:\n",
    "            print(f'Warning: {spc} species description is missing')\n",
    "        print(f'Check {mech}_speciesdescription.csv for missing species descriptions')\n",
    "\n",
    "    ###########################################\n",
    "    # Organize data sort alphabetical, take GC.nml value first\n",
    "    dfgc = dfgc.sort_values(\"Species\") # sort alphabetical\n",
    "    dfgc[\"Phase\"]=dfgc[\"PhaseG\"].fillna('')+dfgc[\"PhaseP\"].fillna('')\n",
    "    dfgc['Molecular Weight (g/mol)']=dfgc['MOLWT_g'].fillna(dfgc['MOLWT_p'])\n",
    "    dfgc['Explicit/Lumped']=dfgc['ExplicitorLumped_g'].fillna(dfgc['ExplicitorLumped_p'])\n",
    "    dfgc['Representative']=dfgc['!RepCmp_g'].fillna(dfgc['!RepCmp_p']) \n",
    "    dfgc['Representative']=dfgc.Representative.str.replace(\"!\",\"\", regex=False)\n",
    "    dfgc['DTXSID']=dfgc['DTXSID_g'].fillna(dfgc['DTXSID_p'])\n",
    "    dfgc['DTXSID']=dfgc['DTXSID'].fillna('') \n",
    "    dfgc['DTXSID']=dfgc['DTXSID'].str.replace(' ','', regex=False) \n",
    "    dfgc['SMILES']=dfgc['SMILES_g'].fillna(dfgc['SMILES_p']) \n",
    "    dfgc['SMILES']=dfgc['SMILES'].str.replace(' ','', regex=False) \n",
    "    # Diagnose stable species based on them being transported in gas or aerosol\n",
    "    dfgc['St']=dfgc['TRNS_g'].fillna('')+dfgc['TRNS_p'].fillna('')\n",
    "    dfgc['St']=dfgc['St'].str.find('Yes') # position of Yes, or -1 when absent\n",
    "    dfgc.loc[dfgc['St']>=0,'Stable']='Yes' # >= so that text starting with Yes (position 0) counts\n",
    "    dfgc.loc[dfgc['St']<0,'Stable']='No'\n",
    "\n",
    "    return dfgc"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "35153f82-117f-470c-981d-af9d7ec9537b",
   "metadata": {},
   "source": [
    "## Prepare metadata for mechanism\n",
    "Warnings will print if molecular weights differ across gas and particle phases or if a species has no description. Paired gas-particle species are identified by a prepended V (gas) or A (particle)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c64f1c3e-0c14-49c9-8072-86921d80868e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ">>gas and particle molecular weights match<<\n"
     ]
    }
   ],
   "source": [
    "# Mechanism to process; prep_metadata prints the result of its molecular weight check\n",
    "mech = 'cracmm2'\n",
    "dfgc = prep_metadata(mech)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "45a28fcc-70a0-452a-94b7-3988f17f03f9",
   "metadata": {},
   "source": [
    "### Save to Markdown File"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "74b4fb68-64b8-46f4-8c58-cc1a55555ac1",
   "metadata": {},
   "outputs": [],
   "source": [
    "###########################################\n",
    "# Write out Markdown for CMAQ GitHub\n",
    "###########################################\n",
    "dfmarkdown = dfgc[['Species','Description','Phase','Molecular Weight (g/mol)','Explicit/Lumped','Representative','DTXSID','SMILES']].copy()\n",
    "dfmarkdown['SMILES']=dfmarkdown['SMILES'].str.replace('NA','', regex=False)\n",
    "# backslash-escape brackets and parentheses so SMILES do not break the markdown link syntax\n",
    "for smileschar in ['[', ']', '(', ')']:\n",
    "    dfmarkdown['SMILES']=dfmarkdown['SMILES'].str.replace(smileschar,'\\\\'+smileschar, regex=False)\n",
    "dfmarkdown['SMILES']=dfmarkdown['SMILES'].fillna('')\n",
    "\n",
    "# Hyperlink SMILES to DTXSID entry in dashboard\n",
    "dfmarkdown['SMILESfmt']= '['+ dfmarkdown.SMILES + '](https://comptox.epa.gov/dashboard/chemical/details/'+ dfmarkdown.DTXSID + ')' \n",
    "maskval = dfmarkdown.DTXSID.str.len()>5 # only rows that carry a DTXSID get a link\n",
    "dfmarkdown.loc[maskval,'SMILES']=dfmarkdown.loc[maskval,'SMILESfmt']\n",
    "dfmarkdown=dfmarkdown.drop(['DTXSID'],axis=1)\n",
    "dfmarkdown=dfmarkdown.drop(['SMILESfmt'],axis=1)\n",
    "\n",
    "# assemble and format table header\n",
    "headerline = ' <sub>Species</sub> | <sub>Description</sub> | <sub>Phase</sub> | <sub>Molecular Weight (g/mol)</sub> | <sub>Explicit/ Lumped</sub> | <sub>Representative Structure</sub> | <sub>SMILES</sub> '\n",
    "firstmarkdownline = \"Gas (G) and particle (P) species from the namelists. SMILES link to representative structures in the EPA Chemicals Dashboard (if available).\"\n",
    "secondmarkdownline = \"Note that for each particulate species in CMAQ, a letter will be appended to the name to designate the size, or mode, of the aerosol being represented: I = Aitken mode, J = Accumulation mode, K = Coarse mode. Prepending of a species with a V or A in CMAQ or the chemical mechanism files indicates the species resides in the gas or particulate phase. \"\n",
    "dfmarkdown['Representative']=dfgc.Representative.str.replace(\";\",\",\", regex=False)\n",
    "\n",
    "# plain-text to markdown/html substitutions for the descriptions, applied in order\n",
    "descriptionfmt = [\n",
    "    (';', ','),\n",
    "    ('ug/m3', '&#956;g m<sup>-3</sup>'),\n",
    "    ('log10C', 'log<sub>10</sub>C'),\n",
    "    ('kOH', 'k<sub>OH</sub>'),\n",
    "    ('cm3', 'cm<sup>3</sup>'),\n",
    "    ('s-1', 's<sup>-1</sup>'),\n",
    "]\n",
    "# powers of ten: 10-10 ... 10-14 have to be handled before 10-1, which is a prefix of them\n",
    "descriptionfmt += [('10'+exponent, '10<sup>'+exponent+'</sup>') for exponent in\n",
    "                   ['-10','-11','-12','-13','-14','-2','-1','+1','+2','+3','+4','+5','+6']]\n",
    "for plaintext, formatted in descriptionfmt:\n",
    "    dfmarkdown['Description']=dfmarkdown.Description.str.replace(plaintext, formatted, regex=False)\n",
    "\n",
    "mdfile = mech+'_species_table.md'\n",
    "filename = os.path.join( outputfiledir, mdfile)\n",
    "# title, two explanatory paragraphs (each followed by a blank line), then the table header\n",
    "mdheader = [mech.upper() + ' Species Table',\n",
    "            firstmarkdownline,\n",
    "            '',\n",
    "            secondmarkdownline,\n",
    "            '',\n",
    "            headerline,\n",
    "            ' ----- | ----- | ----- | ----- | ----- | ----- | ----- ']\n",
    "with open(filename,'w') as mdout: # context manager closes the file even if a write fails\n",
    "    mdout.write('\\n'.join(mdheader) + '\\n')\n",
    "# table rows are appended below the header\n",
    "dfmarkdown.to_csv(filename,index=False,header=False,sep='|',mode='a')\n",
    "with open(filename,'a') as mdout:\n",
    "    mdout.write('\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "25879c81-3e52-44c2-814c-16cdc5e07ef3",
   "metadata": {},
   "source": [
    "### Save to csv file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9f05eddc-41a1-48af-97e7-4a401e835f69",
   "metadata": {},
   "outputs": [],
   "source": [
    "############# Metadata CSV\n",
    "# columns exported to the csv, in output order\n",
    "metadatacolumns = ['Species','Description','Phase','Stable','Molecular Weight (g/mol)',\n",
    "                   'Explicit/Lumped','Representative','SMILES','DTXSID','henryMatm',\n",
    "                   'henryenthalpyK','aerodensity','aerokappa','oacstar','oaenthalpy',\n",
    "                   'oaomoc']\n",
    "# internal column name -> heading written to the csv\n",
    "metadataheadings = {\n",
    "    'henryMatm': 'H Law (M/atm)',\n",
    "    'henryenthalpyK': 'Enthalpy of solution (K)',\n",
    "    'aerodensity': 'Aerosol density (kg/m3)',\n",
    "    'aerokappa': 'Kappa_org',\n",
    "    'oacstar': 'C* (microg/m3)',\n",
    "    'oaenthalpy': 'Enthalpy of vaporization (J/mol)',\n",
    "    'oaomoc': 'OM to OC (g/g)',\n",
    "}\n",
    "\n",
    "dfmetadata = dfgc[metadatacolumns].copy()\n",
    "# non-positive kappa values and empty DTXSID entries are written as NA\n",
    "dfmetadata['aerokappa'] = dfmetadata['aerokappa'].mask(dfmetadata['aerokappa'] <= 0, 'NA')\n",
    "dfmetadata['DTXSID'] = dfmetadata['DTXSID'].mask(dfmetadata['DTXSID'] == '', 'NA')\n",
    "dfmetadata = dfmetadata.rename(columns=metadataheadings)\n",
    "# any remaining missing value is written as NA\n",
    "dfmetadata = dfmetadata.fillna('NA')\n",
    "\n",
    "metafile = mech+'_metadata.csv'\n",
    "filename = os.path.join(outputfiledir, metafile)\n",
    "dfmetadata.to_csv(filename, index=False, header=True, sep=',', mode='w')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "rhel8_py39",
   "language": "python",
   "name": "rhel8_py39"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}