Skip to content

Extract variables from files

el_paso.extract_variables_from_files.extract_variables_from_files

extract_variables_from_files

Extract variable data from files with any file format.

Parameters:

Name Type Description Default
start_time datetime

The start time for data extraction.

required
end_time datetime

The end time for data extraction.

required
file_cadence Literal['daily', 'monthly', 'single_file']

The cadence at which files are organized.

required
data_path Path or str

The directory path where data files are stored.

required
file_name_stem str

The stem of the file name to match files.

required
extraction_infos Iterable[ExtractionInfo]

Information about which variables to extract and how.

required
pd_read_csv_kwargs dict[str, Any]

Additional keyword arguments to pass to pandas.read_csv.

None
custom_extractors dict[str, Callable]

A dictionary mapping file suffixes to custom extractor functions.

None

Returns:

Type Description
dict[str, Variable]

A dictionary mapping result keys to extracted Variable objects.

Raises:

Type Description
ValueError

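If start_time is later than end_time, if no files are found for extraction, or if an ExtractionInfo has no result_key and its name_or_column is an integer (so no key can be inferred).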

Source code in el_paso/extract_variables_from_files.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
def extract_variables_from_files(
    start_time: datetime,
    end_time: datetime,
    file_cadence: Literal["daily", "monthly", "single_file"],
    data_path: Path | str,
    file_name_stem: str,
    extraction_infos: Iterable[ExtractionInfo],
    pd_read_csv_kwargs: dict[str, Any] | None = None,
    custom_extractors: dict[str, Callable] | None = None,
) -> dict[str, Variable]:
    """Extract variable data from files with any file format.

    The files to read are determined from the time range, the file cadence and
    ``data_path / file_name_stem``. The raw data is extracted from those files
    and wrapped into one ``Variable`` per entry of ``extraction_infos``.

    Args:
        start_time (datetime): The start time for data extraction.
        end_time (datetime): The end time for data extraction.
        file_cadence (Literal["daily", "monthly", "single_file"]): The cadence at which files are organized.
        data_path (Path or str): The directory path where data files are stored.
        file_name_stem (str): The stem of the file name to match files.
        extraction_infos (Iterable[ExtractionInfo]): Information about which variables to extract and how.
            Any iterable is accepted, including one-shot iterators such as generators.
        pd_read_csv_kwargs (dict[str, Any], optional): Additional keyword arguments to pass to pandas.read_csv.
        custom_extractors (dict[str, Callable], optional): A dictionary mapping file suffixes to custom extractor functions.

    Returns:
        dict[str, Variable]: A dictionary mapping result keys to extracted Variable objects.
            If two extraction infos resolve to the same key, the later one replaces the earlier one.

    Raises:
        ValueError: If ``start_time`` is later than ``end_time``, if no files are found for extraction,
            or if a result key cannot be inferred because ``result_key`` is ``None`` and
            ``name_or_column`` is an integer.

    """  # noqa: E501
    logger.info("Extracting variables ...")

    if pd_read_csv_kwargs is None:
        pd_read_csv_kwargs = {}

    start_time = enforce_utc_timezone(start_time)
    end_time = enforce_utc_timezone(end_time)

    if start_time > end_time:
        msg = "start_time must be before end_time!"
        logger.error(msg)
        raise ValueError(msg)

    data_path = Path(data_path)

    # Only the list of files is needed here; the second return value is ignored.
    files_list, _ = _construct_file_list(start_time, end_time, file_cadence, data_path / file_name_stem)

    if len(files_list) == 0:
        msg = f"No file found to extract variables! Search at: {data_path / file_name_stem}"
        logger.error(msg)
        raise ValueError(msg)

    # The infos are needed twice (extraction and variable creation). Materialise
    # them once so that a one-shot iterable (e.g. a generator) is not exhausted by
    # the extraction step, which would silently yield an empty result.
    infos = list(extraction_infos)

    variable_data = _extract_data_from_files(files_list, infos, pd_read_csv_kwargs, custom_extractors)

    # The source file names are identical for every variable; compute them once.
    source_file_names = [path.name for path in files_list]

    # create variables based on the extraction_infos
    variables: dict[str, Variable] = {}

    for info in infos:
        if info.result_key is None:
            if isinstance(info.name_or_column, str):
                dict_key = info.name_or_column
            else:
                msg = "Result key cannot be inferred from a integer column! Please provide a result_key!"
                logger.error(msg)
                raise ValueError(msg)
        else:
            dict_key = info.result_key

        variable = Variable(original_unit=info.unit, data=variable_data[info.name_or_column])
        # Give every variable its own list so later mutation of one does not leak into the others.
        variable.metadata.source_files = list(source_file_names)
        variables[dict_key] = variable

    return variables

el_paso.extract_variables_from_files.ExtractionInfo dataclass

Store metadata required to extract a variable from a source file.

Attributes:

Name Type Description
name_or_column str | int

Name of the variable or column to extract from the source file.

unit UnitBase

Physical unit associated with the extracted variable.

is_time_dependent bool

Whether the variable is time-dependent.

If True, data from multiple files will be concatenated along the time axis.

If False, data from multiple files will be used to fill missing (np.nan) values instead of being concatenated.

result_key str | None

Key to use for the extracted variable in the resulting variables dictionary.

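If None, name_or_column is used as the key. This only works when name_or_column is a string; for an integer column a result_key must be provided, otherwise extract_variables_from_files raises a ValueError.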

dependent_variables list[str] | None

Names of variables that the extracted variable depends on.

This is mainly used for JSON extraction to determine how extracted data should be reshaped.

np_dtype DTypeLike | None

Optional NumPy dtype used to cast the extracted data.

If None, the dtype is inferred from the source data.

Source code in el_paso/extract_variables_from_files.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
@dataclass(frozen=True, slots=True, eq=False)
class ExtractionInfo:
    """Store metadata required to extract a variable from a source file.

    Attributes:
        name_or_column:
            Name of the variable or column to extract from the source file.

        unit:
            Physical unit associated with the extracted variable.

        is_time_dependent:
            Whether the variable is time-dependent.

            If ``True``, data from multiple files will be concatenated
            along the time axis.

            If ``False``, data from multiple files will be used to fill
            missing (`np.nan`) values instead of being concatenated.

        result_key:
            Key to use for the extracted variable in the resulting
            variables dictionary.

            If ``None``, ``name_or_column`` is used as the key.

        dependent_variables:
            Names of variables that the extracted variable depends on.

            This is mainly used for JSON extraction to determine how
            extracted data should be reshaped.

        np_dtype:
            Optional NumPy dtype used to cast the extracted data.

            If ``None``, the dtype is inferred from the source data.
    """

    name_or_column: str | int
    unit: u.UnitBase
    is_time_dependent: bool = True
    result_key: str | None = None
    dependent_variables: list[str] | None = None
    np_dtype: DTypeLike | None = None