Source code for src.preprocess

#!/usr/bin/python

######################################
# Imports
######################################

# External
import hydra
from omegaconf import DictConfig
from os.path import join as join_path
import pandas as pd
from pathlib import Path

######################################
# Main
######################################


def write_species_csv(
    species_code: str,
    index_df: pd.DataFrame,
    data_dir: str,
    cols: list,
    outfile: str,
    lowercase_list=None,
    drop_na=False,
) -> None:
    """
    Creates a species dataframe and writes it to a CSV file.

    The species' metadata row is looked up in ``index_df``. The dataset file
    named in that row is read from ``data_dir`` and filtered to the rows whose
    ``Species`` column equals the upper-cased species code. The selected
    columns are written to ``<data_dir>/<class>/<order>/<species>/<outfile>``,
    and the row's ``source`` value is written to a ``README.md`` next to it.

    Args:
        species_code (str): The shark species code.
        index_df (pd.DataFrame): The index dataframe containing metadata.
        data_dir (str): The data directory.
        cols (list): The dataframe column list (lowercase names). A single
            column name given as a string is also accepted.
        outfile (str): The output file name.
        lowercase_list (list, optional): Columns to convert all values to
            lowercase. Defaults to no columns.
        drop_na (bool, optional): Drop missing values

    Raises:
        ValueError: If ``index_df`` does not hold exactly one row for
            ``species_code``.
    """
    species_df = index_df.loc[index_df["species_code"] == species_code]
    if len(species_df) != 1:
        # Fail early with a readable message instead of numpy's generic
        # "can only convert an array of size 1" error from ``.item()``.
        raise ValueError(
            f"Expected exactly one index row for species code "
            f"{species_code!r}, found {len(species_df)}."
        )

    def extract_val(key):
        return species_df[key].values.item()

    species = extract_val("species")
    class_type = extract_val("class")
    order = extract_val("order")

    # Make dirs
    out_dir = join_path(data_dir, class_type, order, species)
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    # Load data
    data_path = join_path(data_dir, extract_val("dataset"))
    raw_df = pd.read_csv(data_path)
    # Copy the filtered rows so the column assignment below writes to an
    # independent frame rather than a possible view of ``raw_df``.
    data_df = raw_df.loc[raw_df["Species"] == species_code.upper()].copy()

    # Preprocess
    data_df["age"] = data_df["AgeAgree"]
    data_df.columns = data_df.columns.str.lower()
    # Normalise to a plain list: config containers (e.g. a Hydra ListConfig)
    # are iterable but are not guaranteed to index a DataFrame like a list.
    columns = [cols] if isinstance(cols, str) else list(cols)
    data_df = data_df[columns]
    if drop_na:
        data_df = data_df.dropna(axis=0)
    # Detach from the intermediate selections before assigning columns.
    data_df = data_df.copy()
    for lower_col in lowercase_list or ():
        data_df[lower_col] = data_df[lower_col].astype("str").str.lower()

    # Write data
    data_df.to_csv(join_path(out_dir, outfile), index=False)

    # Record where the data came from alongside the CSV
    readme_path = join_path(out_dir, "README.md")
    with open(readme_path, "w", encoding="utf-8") as f:
        f.write(extract_val("source"))
@hydra.main(version_base=None, config_path="../conf", config_name="config")
def main(config: DictConfig) -> None:
    """
    Runs the preprocess pipeline for every configured species.

    Reads the species list, data locations and preprocessing options from
    the configuration, loads the index CSV once, and calls
    ``write_species_csv`` for each species code.

    Args:
        config (DictConfig): The pipeline configuration.
    """
    # Settings read from the configuration
    species_codes = config["common"]["species"]

    data_settings = config["data"]
    data_dir = data_settings["dir"]
    index_name = data_settings["index"]
    out_name = data_settings["out"]

    preprocess_settings = config["preprocess"]
    columns = preprocess_settings["cols"]
    drop_missing = preprocess_settings["drop_na"]
    lowercase_columns = preprocess_settings["lowercase"]

    # The index is shared by all species, so it is read a single time
    index_df = pd.read_csv(join_path(data_dir, index_name))

    for code in species_codes:
        write_species_csv(
            species_code=code,
            index_df=index_df,
            data_dir=data_dir,
            cols=columns,
            outfile=out_name,
            lowercase_list=lowercase_columns,
            drop_na=drop_missing,
        )
# Run the preprocess pipeline only when this file is executed as a script.
if __name__ == "__main__": main()