Coverage for src/colorspace/datasets.py: 91%
69 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-29 15:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-29 15:11 +0000
def _getdataset_volcano():
    """Topographic Information on Auckland's Maunga Whau Volcano

    Thin loader used by the data set lookup: it simply forwards to
    :py:func:`get_volcano_data <colorspace.demos.get_volcano_data>`
    with `array = True`. See the man page of that function for details
    about the data set.

    Returns:
        Whatever `get_volcano_data(array = True)` returns.
    """
    # Imported lazily so the demos module is only loaded on request.
    from . import demos

    volcano = demos.get_volcano_data(array = True)
    return volcano
def _getdataset_HarzTraffic():
    """Daily Traffic Counts Sonnenberg

    Reads `data/HarzTraffic.csv` (located next to this module) and adds
    the derived columns `season`, `dow`, `dayofweek`, and `weekend`.

    Requires `pandas` to be installed.

    Returns:
        pandas.DataFrame: Content of the CSV file where `date` has been
        converted to `datetime.date` objects, plus the four derived columns.

    Raises:
        Exception: If `pandas` is not installed or if the CSV file
            cannot be read.
    """
    try:
        import pandas as pd
    except ImportError as err:
        raise Exception("'HarzTraffic' requires `pandas` to be installed") from err
    import os

    # The data set is shipped in the 'data' folder next to this module
    csv = os.path.join(os.path.dirname(__file__), "data", "HarzTraffic.csv")

    # Trying to read the data set; report the offending path on failure
    try:
        data = pd.read_csv(csv)
    except Exception as err:
        raise Exception(f"problems reading \"{csv}\"") from err

    # Parse the dates once; the 'date' column itself stores plain dates
    stamps = pd.to_datetime(data["date"])
    data["date"] = stamps.dt.date

    # Adding season: everything not in Mar-Nov stays 'winter'
    month = stamps.dt.month
    data["season"] = "winter"
    data.loc[month.between(3, 5), "season"] = "spring"
    data.loc[month.between(6, 8), "season"] = "summer"
    data.loc[month.between(9, 11), "season"] = "autumn"

    # Day of week as integer (keeps the natural order) and as short name
    dow = stamps.dt.dayofweek
    names = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    data["dow"] = dow
    data["dayofweek"] = [names[i] for i in dow]

    # Boolean flag for 'weekend': Saturday (5) or Sunday (6)
    data["weekend"] = dow >= 5

    return data
def _getdataset_MonthlyHarzTraffic():
    """Monthly Traffic Counts Summary Sonnenberg

    Aggregates the daily `HarzTraffic` data set per year and month:
    vehicle counts and rain are summed up, temperature, sunshine and
    wind are averaged. Floating point columns are rounded to one digit
    and a `season` column is added.

    Requires `pandas` to be installed.

    Returns:
        pandas.DataFrame: One row per (year, month) with the columns
        `year`, `month`, `bikes`, `cars`, `trucks`, `others`, `rain`,
        `temp`, `sunshine`, `wind`, and `season`.

    Raises:
        Exception: If `pandas` is not installed or the daily data set
            cannot be loaded.
    """
    try:
        import pandas as pd
    except ImportError as err:
        raise Exception("'MonthlyHarzTraffic' requires `pandas` to be installed") from err

    import numpy as np

    # Loading the daily data set (loader defined in this module)
    data = _getdataset_HarzTraffic()

    # Appending year and month
    stamps = pd.to_datetime(data["date"])
    data["year"] = stamps.dt.year
    data["month"] = stamps.dt.month

    # Aggregating sums and means for specific columns; the order of
    # the entries defines the column order of the result.
    how = {"bikes": "sum", "cars": "sum", "trucks": "sum",
           "others": "sum", "rain": "sum",
           "temp": "mean", "sunshine": "mean", "wind": "mean"}
    data = data.groupby(["year", "month"]).agg(how).reset_index()

    # Round floating point columns to 1 digit. The check is done on the
    # dtype of the column, not on the column (Series) itself.
    for col in data.columns:
        if np.issubdtype(data[col].dtype, np.floating):
            data[col] = data[col].round(1)

    # Adding season: everything not in Mar-Nov stays 'winter'
    data["season"] = "winter"
    data.loc[data["month"].between(3, 5), "season"] = "spring"
    data.loc[data["month"].between(6, 8), "season"] = "summer"
    data.loc[data["month"].between(9, 11), "season"] = "autumn"

    return data
def dataset(name):
    """Loading colorspace Package Example Data

    The package `colorspace` comes with a few small data sets used
    in the Examples and/or the documentation. This function allows
    for easy access to these data sets. Note that some data sets
    require `pandas` to be installed.

    #### **volcano**: Maunga Whau Volcano

    Topographic information on Auckland's Maunga Whau Volcano on
    a 10m x 10m grid. Will return a two-dimensional `numpy.ndarray`
    of dimension 61x87 (int64).

    Digitized from a topographic map by Ross Ihaka. These data should
    not be regarded as accurate.

    #### **HarzTraffic**: Daily Traffic Counts at Sonnenberg

    The data set provides daily traffic counts for bikes (motor bikes),
    cars, trucks, and other vehicles in the vicinity of Sonnenberg
    located in the Harz region in Germany. The data set covers
    a period of nearly three years (2021-01-01 to 2023-11-30).

    A `pandas.DataFrame` containing 1057 observations (rows) on 20 variables:

    * `date` date, the date of the record.
    * `yday` int64, the day of the year.
    * `bikes` int64, the number of motorcycles on that day.
    * `cars` int64, the number of cars on that day.
    * `trucks` int64, the number of trucks on that day.
    * `others` int64, the number of other vehicles on that day.
    * `tempmin` float64, minimum temperature in degrees Celsius.
    * `tempmax` float64, maximum temperature in degrees Celsius.
    * `temp` float64, mean temperature in degrees Celsius.
    * `humidity` int64, mean relative humidity in percent.
    * `tempdew` float64, average dewpoint temperature in degrees Celsius.
    * `cloudiness` int64, average cloud cover in percent.
    * `rain` float64, amount of precipitation in mm (snow and rain).
    * `sunshine` int64, sunshine duration in minutes.
    * `wind` float64, mean wind speed in meters per second.
    * `windmax` float64, maximum wind speed in meters per second.
    * `season`: object, local season (spring, summer, autumn, winter).
    * `dow`: int64, numeric day of week (0 = Mon, 6 = Sun).
    * `dayofweek`: object, short name of day of week.
    * `weekend`: bool, True if the day is Saturday or Sunday, else False.

    Weather data: Deutscher Wetterdienst (DWD), Climate Data Center (CDC),
    station Wernigerode (5490; Sachsen-Anhalt) w/ location 10.7686/51.8454/233
    (lon, lat, alt, EPSG 4326). CC-BY 4.0, available via
    <https://opendata.dwd.de/climate_environment/CDC/>.

    Traffic data: Bundesanstalt für Strassenwesen (BASt), station Sonnenberg.
    CC-BY 4.0, available via <https://www.bast.de>,
    <https://www.bast.de/DE/Verkehrstechnik/Fachthemen/v2-verkehrszaehlung/Verkehrszaehlung.html>.


    #### **MonthlyHarzTraffic**: Monthly Summary of Traffic Counts at Sonnenberg

    Based on the daily data set `HarzTraffic` (see above) but aggregated on
    a monthly basis.

    A `pandas.DataFrame` containing 35 observations (rows) on 11 variables:

    * `year`: int32, year of record.
    * `month`: int32, month of record.
    * `bikes`: int64, the total number of bikes in that month.
    * `cars`: int64, the total number of cars in that month.
    * `trucks`: int64, the total number of trucks in that month.
    * `others`: int64, the total number of other vehicles in that month.
    * `rain`: float64, monthly precipitation sum in mm (snow and rain).
    * `temp`: float64, monthly mean temperature in degrees Celsius.
    * `sunshine`: float64, monthly average of sunshine per day in minutes.
    * `wind`: float64, monthly mean wind speed in meters per second.
    * `season`: object, local season (spring, summer, autumn, winter).

    Data source and license: see data set description 'HarzTraffic'.


    Args:
        name (str): Name of the data set to be returned.

    Returns:
        The object returned depends on the data set (see above).

    Raises:
        TypeError: If `name` is not a str.
        ValueError: If no data set called `name` is available.
    """

    if not isinstance(name, str):
        raise TypeError("argument `name` must be str")

    # Every data set is provided by a module-level loader function called
    # `_getdataset_<name>`; look it up in the namespace of this module.
    prefix = "_getdataset_"
    namespace = globals()
    fun = namespace.get(prefix + name)

    if fun is None:
        # Create listing of all available data sets for the error message
        available = sorted(key[len(prefix):] for key in namespace
                           if key.startswith(prefix))
        raise ValueError(f"dataset \"{name}\" does not exist. " + \
                         f"Available data sets are: {', '.join(available)}.")

    return fun()