Coverage for src/colorspace/statshelper.py: 100%

102 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-08-23 19:54 +0000

1 

2 

3def nprange(x): 

4 """Range of Values 

5 

6 Mimiking R's `range()` function, takes a numeric numpy array 

7 and returns an array of length `2` with the minimum and maximum. 

8 

9 Args: 

10 x (numpy.ndarray): Numpy array, must be numeric. 

11 

12 Returns: 

13 numpy.array: Retuns a numpy array with two elements 

14 containing the minimum of `x` and the maximum of `x`. 

15 """ 

16 import numpy as np 

17 if not isinstance(x, np.ndarray): 

18 raise TypeError("argument `x` must be a numpy.array") 

19 elif not np.issubdtype(x.dtype, np.float64) and not np.issubdtype(x.dtype, np.int64): 

20 raise TypeError("argument `x` must be float or int") 

21 elif not len(x) > 0 or len(x.shape) != 1: 

22 raise ValueError("argument `x` must be of length > 0 and 1D") 

23 

24 return np.asarray([np.min(x), np.max(x)]) 

25 

26 

27def natural_cubic_spline(x, y, xout): 

28 """Natural Cubic Spline Interpolation 

29  

30 Natural cubic spline interpolation. Takes two arrays `x` and `y` 

31 trough which a spline is fitted and evaluated at `xout`. Performs 

32 second-order (linear) extrapolation for values in `xout` outside 

33 of `x`. 

34  

35 Args: 

36 x (numpy.ndarray): original x data points. Must be float or int 

37 and length > 0. 

38 y (numpy.ndarray): original y data points, same requirements 

39 as `x`. Must also be of the same length as `y`. 

40 xout (numpy.ndarray): numeric vectotr (float or int; length > 0) 

41 at which the spline should be evaluated. 

42  

43 Returns: 

44 dict: Dictionary with two elements, `x` (same as input `xout`) 

45 and `y` with the interpolated values evaluated at `x` (`xout`). 

46  

47 Examples: 

48 >>> from colorspace.statshelper import natural_cubic_spline 

49 >>> import numpy as np 

50 >>> x = np.arange(10, 20.1, 0.5) 

51 >>> y = np.sin((x - 3) / 2) 

52 >>> xout = np.arange(0, 40, 0.2) 

53 >>>  

54 >>> res = natural_cubic_spline(x, y, xout) 

55 >>> 

56 >>> from matplotlib import pyplot as plt 

57 >>> plt.figure() 

58 >>> plt.plot(x, y, "o", label = "data points") 

59 >>> plt.plot(res["x"], res["y"], label = "cubic spline", color = "orange") 

60 >>> plt.legend() 

61 >>> plt.show() 

62 >>> 

63 >>> #: Example used for tests 

64 >>> x = np.asarray([1, 2, 3, 5.5, 6.5]) 

65 >>> y = np.asarray([6.5, 5.5, 5., 8.5, 9.5]) 

66 >>> xout = np.arange(-3, 9, 0.01) 

67 >>>  

68 >>> from colorspace.statshelper import natural_cubic_spline as ncs 

69 >>> res = ncs(x = x, y = y, xout = xout) 

70 >>> f = plt.figure() 

71 >>> plt.plot(res["x"], res["y"]) 

72 >>> plt.show() 

73 

74 """ 

75 import numpy as np 

76 

77 if not isinstance(x, np.ndarray): 

78 raise TypeError("argument `x` must be numpy.ndarray") 

79 if not isinstance(y, np.ndarray): 

80 raise TypeError("argument `y` must be numpy.ndarray") 

81 if not isinstance(xout, np.ndarray): 

82 raise TypeError("argument `xout` must be numpy.ndarray") 

83 if len(x) == 0: 

84 raise ValueError("array on `x` must be of length > 0") 

85 if not len(x) == len(y): 

86 raise ValueError("length of `x` and `y` must be identical") 

87 

88 def check(x): 

89 return np.issubdtype(x.dtype, np.float64) or np.issubdtype(x.dtype, np.integer) 

90 if not check(x): 

91 raise TypeError("argument `x` must be np.float or np.integer") 

92 if not check(y): 

93 raise TypeError("argument `y` must be np.float or np.integer") 

94 if not check(xout): 

95 raise TypeError("argument `xout` must be np.float or np.integer") 

96 if len(xout) == 0: 

97 raise ValueError("array on `xout` must be of length > 0") 

98 

99 # Enforce float 

100 y = y.astype(np.float32) 

101 x = x.astype(np.float32) 

102 xout = xout.astype(np.float32) 

103 

104 # If length of y is only 1 we can't fit a spline and the 

105 # return is simply a constant y[0] for each xout. 

106 if len(x) == 1: 

107 return {"x": xout, "y": np.repeat(y[0], len(xout))} 

108 

109 # Index length 

110 n = len(x) - 1 

111 

112 # Step 1: Calculate h_i 

113 h = np.diff(x) 

114 

115 # Step 2: Calculate the coefficients a, b, c, and d 

116 a = y[:-1] 

117 

118 # Step 3: Solve the tridiagonal system for c 

119 A = np.zeros((n + 1, n + 1)) 

120 b = np.zeros(n + 1) 

121 

122 # Natural spline boundary conditions 

123 A[0, 0] = 1 

124 A[n, n] = 1 

125 

126 for i in range(1, n): 

127 A[i, i - 1] = h[i-1] 

128 A[i, i] = 2 * (h[i - 1] + h[i]) 

129 A[i, i + 1] = h[i] 

130 b[i] = 3 * ((y[i + 1] - y[i]) / h[i] - (y[i] - y[i - 1]) / h[i - 1]) 

131 

132 c = np.linalg.solve(A, b) 

133 

134 # Step 4: Calculate b and d 

135 b = (y[1:] - y[:-1]) / h - h * (2 * c[:-1] + c[1:]) / 3 

136 d = (c[1:] - c[:-1]) / (3 * h) 

137 

138 # Organize coefficients 

139 coef = np.transpose([a, b, c[:-1], d]) 

140 

141 # Prediction/evaluation 

142 yout = np.zeros_like(xout) 

143 

144 for j in range(len(xout)): 

145 # Extrapolation left hand side (linear) 

146 if xout[j] < np.min(x): 

147 dx = np.min(x) - xout[j] 

148 yout[j] = y[0] - coef[0, 1] * dx - coef[0, 2] * dx 

149 # Extrapolation right hand side (linear) 

150 elif xout[j] > np.max(x): 

151 # Coef should be 0.8259146 

152 k = coef.shape[0] - 1 

153 dx = xout[j] - np.max(x) 

154 yout[j] = y[-1] + coef[k, 1] * dx + coef[k, 2] * dx 

155 # Else interpolate btw. two neighboring points 

156 else: 

157 for i in range(n): 

158 if x[i] <= xout[j] <= x[i + 1]: 

159 dx = xout[j] - x[i] 

160 yout[j] = coef[i, 0] + coef[i, 1] * dx + coef[i, 2] * dx**2 + coef[i, 3] * dx**3 

161 

162 return {"x": xout, "y": yout} 

163 

164 

165# Simple linear regression solver (OLS solver) 

166def lm(y, X, Xout): 

167 """Linear Regression 

168 

169 OLS solver for (simple) linear regression models. 

170 

171 Args: 

172 y (np.ndarray): response, one-dimensional array (float). 

173 X (np.ndarray): model matrix, two dimensional with observations in 

174 rows (number of rows equal to length of y; float). 

175 Xout (np.ndarray): Same format as `X`, the model matrix for which 

176 the predictions will be calculated and returned. 

177 

178 Returns: 

179 list: Returns a list containing the estimated regression coefficients 

180 (`coef`), the standard error of the residuals (`sigma`), and the 

181 predictions for `Xout` (on `Yout`; 1-d). 

182 

183 Example: 

184 >>> # Example from Rs stats::lm man page + additional linear effect. 

185 >>> # Annette Dobson (1990) "An Introduction to Generalized Linear Models".  

186 >>> # Page 9: Plant Weight Data.  

187 >>> weight = np.asarray([4.17, 5.58, 5.18, 6.11, 4.50, 4.61, 5.17, 4.53, 5.33, 5.14, 

188 >>> 4.81, 4.17, 4.41, 3.59, 5.87, 3.83, 6.03, 4.89, 4.32, 4.69]) 

189 >>>  

190 >>> # Dummy variable for 'Treatment group'  

191 >>> trt = np.repeat([0., 1.], 10) 

192 >>>  

193 >>> # Alternating +/-0.1 'noise' 

194 >>> rand = np.repeat(-0.1, 20) 

195 >>> rand[::2] = +0.1 

196 >>> rand = weight / 2 + rand 

197 >>>  

198 >>> # Create model matrix 

199 >>> from colorspace.statshelper import lm 

200 >>> X = np.transpose([np.repeat(1., 20), rand, trt])  

201 >>> mod = lm(y = weight, X = X, Xout = X) 

202 >>> print(mod) 

203 

204 Raises: 

205 TypeError: If `y`, `X`, `Xout` are not numpy.ndarrays. 

206 ValueError: If `X` or `Xout` are not two-dimensional. 

207 ValueError: If length `y` does not match first dimension of `X`. 

208 ValueError: If second dimension of `X` and `Xout` mismatch. 

209 """ 

210 import numpy as np 

211 if not isinstance(y, np.ndarray): 

212 raise TypeError("argument `y` must be numpy.ndarray") 

213 if not isinstance(X, np.ndarray): 

214 raise TypeError("argument `X` must be numpy.ndarray") 

215 if not isinstance(Xout, np.ndarray): 

216 raise TypeError("argument `Xout` must be numpy.ndarray") 

217 

218 # Shape 

219 if not len(X.shape) == 2 or not len(Xout.shape) == 2: 

220 raise ValueError("both `X` and `Xout` must be two-dimensional") 

221 if not len(y) == X.shape[0]: 

222 raise ValueError("length of `y` does not match first dimension of `X`") 

223 if not X.shape[1] == Xout.shape[1]: 

224 raise ValueError("second dimension of `X` and `Xout` do not match") 

225 

226 # Solving 

227 coef = np.linalg.lstsq(X, y, rcond=None)[0] 

228 

229 # Fitted values 

230 yfit = np.dot(X, coef) 

231 

232 # Residual standard error 

233 sigma = np.std(y - yfit, ddof = len(coef)) 

234 

235 # Prediction on Xout 

236 Yout = np.dot(Xout, coef) 

237 

238 return({"coef": coef, "sigma": sigma, "Yout": Yout}) 

239 

240 

241 

242def split(x, y): 

243 """Divide into Groups and Reassemble 

244 

245 Helper function mimiking Rs split() function. 

246 Takes two numpy arrays (x, y) of same length. 

247 Splits x in segments according to the values of y (whenever 

248 the value in y changes). E.g., 

249 

250 Args: 

251 x (numpy.ndarray): Array containing the data to be divided into groups. 

252 y (numpy.ndarray): Array of the same length as `x` containing the 

253 grouping information. The original array `x` will be cut at each 

254 element in `y` where the value changes. 

255 

256 Return: 

257 list of lists: Returns a list of lists, where each element in the 

258 object corresponds to one group, containing the original but now divided 

259 elements from `x`. 

260 

261 Examples: 

262 >>> tmp = np.asarray([1, 2, 3, 4, 5]) 

263 >>> split(tmp, tmp == 3) 

264 >>> [[1, 2], [3], [4, 5]] 

265  

266 >>> tmp = np.asarray([1, 2, 3, 4, 5]) 

267 >>> split(tmp, np.asarray([1, 1, 2, 2, 1])) 

268 >>> [[1, 2], [3, 4], [5]] 

269 """ 

270 import numpy as np 

271 if not isinstance(x, np.ndarray): 

272 raise TypeError("argument `x` must be numpy array") 

273 if not isinstance(y, np.ndarray): 

274 raise TypeError("argument `y` must be numpy array") 

275 if not len(x) > 0: 

276 raise ValueError("array x must be length >= 1") 

277 if not len(x) == len(y): 

278 raise ValueError("arrays x/y must be of same length") 

279 

280 # If length of x is one: Return as is 

281 if len(x) == 1: 

282 return [x] 

283 

284 # Start with list-of-lists containing first element 

285 res = [[x[0]]] 

286 for i in range(1, len(x)): 

287 if y[i] == y[i - 1]: res[len(res) - 1].append(x[i]) # Append 

288 else: res.append([x[i]]) # Add new list 

289 

290 return res 

291 

292 

293