Coverage for src/colorspace/datasets.py: 91%

69 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-29 15:11 +0000

1 

2 

def _getdataset_volcano():
    """Topographic Information on Auckland's Maunga Whau Volcano

    Thin convenience loader used by :py:func:`dataset`. The actual work is
    delegated to
    :py:func:`get_volcano_data <colorspace.demos.get_volcano_data>`; see the
    man page of that function for details about this data set.

    Returns:
        Whatever `get_volcano_data(array = True)` returns.
    """
    # Imported lazily, at call time, rather than at module import time.
    from .demos import get_volcano_data as _load_volcano

    elevation = _load_volcano(array = True)
    return elevation

11 

12 

13def _getdataset_HarzTraffic(): 

14 """Daily Traffic Counts Sonnenberg 

15 

16 Requires `pandas` to be installed. 

17 """ 

18 try: 

19 import pandas as pd 

20 except: 

21 raise Exception("'HarzTraffic' requires `pandas` to be installed") 

22 import os 

23 import numpy as np 

24 

25 # Loading the data set 

26 resource_package = os.path.dirname(__file__) 

27 csv = os.path.join(resource_package, "data", "HarzTraffic.csv") 

28 

29 # Trying to read the data set 

30 try: 

31 data = pd.read_csv(csv) 

32 except: 

33 raise Exception("problems reading \"{csv}\"") 

34 

35 # Convert 'data' column to datetime 

36 data.date = pd.to_datetime(data.date).dt.date 

37 

38 # Adding season 

39 m = pd.DatetimeIndex(data.date).month 

40 data["season"] = np.repeat("winter", data.shape[0]) 

41 data.loc[(m >= 3) & (m <= 5), "season"] = "spring" 

42 data.loc[(m >= 6) & (m <= 8), "season"] = "summer" 

43 data.loc[(m >= 9) & (m <= 11), "season"] = "autumn" 

44 del m 

45 

46 # Boolean flag for 'weekend' 

47 d = pd.DatetimeIndex(data.date).dayofweek 

48 names = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] 

49 data["dow"] = d # Use integer this for order 

50 data["dayofweek"] = np.asarray([names[x] for x in d]) 

51 data["weekend"] = np.repeat(False, data.shape[0]) 

52 data.loc[(d >= 5), "weekend"] = True # Saturday (5) or Sunday (6) 

53 del d 

54 

55 return data 

56 

def _getdataset_MonthlyHarzTraffic():
    """Monthly Traffic Counts Summary Sonnenberg

    Loads the daily `HarzTraffic` data set and aggregates it per
    year and month:

    * `bikes`, `cars`, `trucks`, `others`, `rain`: monthly sums.
    * `temp`, `sunshine`, `wind`: monthly means.

    Floating point columns are rounded to one digit and a `season`
    column ("winter", "spring", "summer", "autumn") is appended.

    Requires `pandas` to be installed.

    Returns:
        pandas.DataFrame: One row per year/month combination with the
        columns `year`, `month`, the aggregated columns listed above,
        and `season`.

    Raises:
        ImportError: If `pandas` is not installed.
    """
    try:
        import pandas as pd
    except ImportError as err:
        # ImportError is a subclass of Exception, existing handlers still match
        raise ImportError("'MonthlyHarzTraffic' requires `pandas` to be installed") from err

    import numpy as np

    # Loading the daily data set; calling the sibling loader directly
    # avoids having this module import itself.
    data = _getdataset_HarzTraffic()

    # Appending year and month (grouping keys)
    stamps = pd.DatetimeIndex(data["date"])
    data["year"] = stamps.year
    data["month"] = stamps.month

    # Aggregating sums (a1) and means (a2) for specific columns
    keys = ["year", "month"]
    a1 = data.loc[:, keys + ["bikes", "cars", "trucks", "others", "rain"]] \
             .groupby(keys).agg("sum")
    a2 = data.loc[:, keys + ["temp", "sunshine", "wind"]] \
             .groupby(keys).agg("mean")

    # Merge and round to 1 digit; test the column dtype explicitly rather
    # than handing the whole Series to np.issubdtype.
    data = a1.merge(a2, on = keys).reset_index()
    for col in data.columns:
        if np.issubdtype(data[col].dtype, np.floating):
            data[col] = np.round(data[col], 1)

    # Adding season flag: default is winter (Dec, Jan, Feb), then overwrite
    data["season"] = np.repeat("winter", data.shape[0])
    data.loc[(data.month >= 3) & (data.month <= 5), "season"] = "spring"
    data.loc[(data.month >= 6) & (data.month <= 8), "season"] = "summer"
    data.loc[(data.month >= 9) & (data.month <= 11), "season"] = "autumn"

    return data

96 

97 

98 

def dataset(name):
    """Loading colorspace Package Example Data

    The package `colorspace` comes with a few small data sets used
    in the Examples and/or the documentation. This function allows
    for easy access to these data sets. Note that some data sets
    require `pandas` to be installed.

    #### **volcano**: Maunga Whau Volcano

    Topographic information on Auckland's Maunga Whau Volcano on
    a 10m x 10m grid. Will return a two-dimensional `numpy.ndarray`
    of dimension 61x87 (int64).

    Digitized from a topographic map by Ross Ihaka. These data should
    not be regarded as accurate.

    #### **HarzTraffic**: Daily Traffic Counts at Sonnenberg

    The data set provides daily traffic counts for bikes (motor bikes),
    cars, trucks, and other vehicles in the vicinity of Sonnenberg
    located in the Harz region in Germany. The data set covers
    a period of nearly three years (2021-01-01 to 2023-11-30).

    A `pandas.DataFrame` containing 1057 observations (rows) on 20 variables:

    * `date` date, the date of the record.
    * `yday` int64, the day of the year.
    * `bikes` int64, the number of motorcycles on that day.
    * `cars` int64, the number of cars on that day.
    * `trucks` int64, the number of trucks on that day.
    * `others` int64, the number of other vehicles on that day.
    * `tempmin` float64, minimum temperature in degrees Celsius.
    * `tempmax` float64, maximum temperature in degrees Celsius.
    * `temp` float64, mean temperature in degrees Celsius.
    * `humidity` int64, mean relative humidity in percent.
    * `tempdew` float64, average dewpoint temperature in degrees Celsius.
    * `cloudiness` int64, average cloud cover in percent.
    * `rain` float64, amount of precipitation in mm (snow and rain).
    * `sunshine` int64, sunshine duration in minutes.
    * `wind` float64, mean wind speed in meters per second.
    * `windmax` float64, maximum wind speed in meters per second.
    * `season`: object, local season (spring, summer, autumn, winter).
    * `dow`: int64, numeric day of week (0 = Mon, 6 = Sun).
    * `dayofweek`: object, short name of day of week.
    * `weekend`: bool, True if the day is Saturday or Sunday, else False.

    Weather data: Deutscher Wetterdienst (DWD), Climate Data Center (CDC),
    station Wernigerode (5490; Sachsen-Anhalt) w/ location 10.7686/51.8454/233
    (lon, lat, alt, EPSG 4326). CC-BY 4.0, available via
    <https://opendata.dwd.de/climate_environment/CDC/>.

    Traffic data: Bundesanstalt für Strassenwesen (BASt), station Sonnenberg.
    CC-BY 4.0, available via <https://www.bast.de>,
    <https://www.bast.de/DE/Verkehrstechnik/Fachthemen/v2-verkehrszaehlung/Verkehrszaehlung.html>.


    #### **MonthlyHarzTraffic**: Monthly Summary of Traffic Counts at Sonnenberg

    Based on the daily data set `HarzTraffic` (see above) but aggregated on
    a monthly basis.

    A `pandas.DataFrame` containing 35 observations (rows) on 11 variables:

    * `year`: int32, year of record.
    * `month`: int32, month of record.
    * `bikes`: int64, the total number of bikes in that month.
    * `cars`: int64, the total number of cars in that month.
    * `trucks`: int64, the total number of trucks in that month.
    * `others`: int64, the total number of other vehicles in that month.
    * `rain`: float64, monthly precipitation sum in mm (snow and rain).
    * `temp`: float64, monthly mean temperature in degrees Celsius.
    * `sunshine`: float64, monthly average of sunshine per day in minutes.
    * `wind`: float64, monthly mean wind speed in meters per second.
    * `season`: object, local season (spring, summer, autumn, winter).

    Data source and license: see data set description 'HarzTraffic'.


    Args:
        name (str): Name of the data set to be returned.

    Returns:
        The object returned depends on the data set (see above).

    Raises:
        TypeError: If `name` is not a str.
        ValueError: If no data set called `name` exists; the message
            lists the names of all available data sets.
    """

    if not isinstance(name, str):
        raise TypeError("argument `name` must be str")

    # Every loader lives in this module as `_getdataset_<name>`, so the
    # module namespace itself is the registry (no self-import required).
    prefix = "_getdataset_"
    namespace = globals()

    try:
        fun = namespace[prefix + name]
    except KeyError:
        # Only build the listing when it is needed for the error message;
        # sorted so that the message is deterministic.
        available = sorted(key[len(prefix):] for key in namespace
                           if key.startswith(prefix))
        raise ValueError(f"dataset \"{name}\" does not exist. "
                         f"Available data sets are: {', '.join(available)}.") from None

    return fun()

207