#![warn(rust_2018_idioms)]
#![cfg_attr(feature = "unstable", feature(test))]
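//! Ordinary least squares (OLS) linear regression built on `nalgebra`.
//!
//! Models are fitted via the Moore-Penrose pseudo-inverse of the regressor
//! matrix. `RegressionDataBuilder` assembles and validates the input data,
//! and `FormulaRegressionBuilder` describes and fits a model, either through
//! an R-style formula such as `"Y ~ X1 + X2"` or through explicit column
//! names.
//!
//! A minimal usage sketch, adapted from the unit tests at the bottom of this
//! file (`linregress` is assumed as the crate name here):
//!
//! ```
//! use linregress::{FormulaRegressionBuilder, RegressionDataBuilder};
//!
//! # fn main() -> Result<(), linregress::Error> {
//! let data = vec![("Y", vec![1., 2., 3., 5.]), ("X", vec![4., 3., 2., 1.])];
//! let data = RegressionDataBuilder::new().build_from(data)?;
//! let model = FormulaRegressionBuilder::new()
//!     .data(&data)
//!     .formula("Y ~ X")
//!     .fit()?;
//! println!("R squared: {}", model.rsquared);
//! # Ok(())
//! # }
//! ```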
use std::borrow::Cow;
use std::collections::{BTreeSet, HashMap, HashSet};
use std::iter;
use nalgebra::{DMatrix, DVector, RowDVector};
pub use error::{Error, InconsistentSlopes};
use special_functions::stdtr;
mod error;
mod special_functions;
macro_rules! ensure {
($predicate:expr, $error:expr) => {
if !$predicate {
return Err($error);
}
};
}
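/// A builder used to describe and fit a linear model on a `RegressionData` set.
///
/// The model is given either as an R-style formula via `formula`, e.g.
/// `"Y ~ X1 + X2"`, or as explicit column names via `data_columns`. Exactly
/// one of the two must be set before calling `fit` or `fit_without_statistics`.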
#[derive(Debug, Clone)]
pub struct FormulaRegressionBuilder<'a> {
data: Option<&'a RegressionData<'a>>,
formula: Option<Cow<'a, str>>,
columns: Option<(Cow<'a, str>, Vec<Cow<'a, str>>)>,
}
impl<'a> Default for FormulaRegressionBuilder<'a> {
fn default() -> Self {
FormulaRegressionBuilder::new()
}
}
impl<'a> FormulaRegressionBuilder<'a> {
pub fn new() -> Self {
FormulaRegressionBuilder {
data: None,
formula: None,
columns: None,
}
}
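    /// Sets the `RegressionData` to fit the model on.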
pub fn data(mut self, data: &'a RegressionData<'a>) -> Self {
self.data = Some(data);
self
}
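    /// Sets the formula describing the model to fit.
    ///
    /// The expected syntax is `"<regressand> ~ <regressor> + <regressor> + ..."`,
    /// e.g. `"Y ~ X1 + X2"`: a single `~` separates the regressand from the
    /// regressors, and `+` separates the individual regressors. Cannot be
    /// combined with `data_columns`.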
pub fn formula<T: Into<Cow<'a, str>>>(mut self, formula: T) -> Self {
self.formula = Some(formula.into());
self
}
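    /// Sets the regressand and the regressor columns directly, as an
    /// alternative to `formula`. Cannot be combined with `formula`.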
pub fn data_columns<I, S1, S2>(mut self, regressand: S1, regressors: I) -> Self
where
I: IntoIterator<Item = S2>,
S1: Into<Cow<'a, str>>,
S2: Into<Cow<'a, str>>,
{
let regressand = regressand.into();
let regressors: Vec<_> = regressors.into_iter().map(|i| i.into()).collect();
self.columns = Some((regressand, regressors));
self
}
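    /// Fits the model and computes the associated statistics (standard errors,
    /// p-values, R squared, ...).
    ///
    /// Returns an error if neither or both of `formula` and `data_columns`
    /// were given, if a referenced column is missing from the data, or if the
    /// underlying fit fails.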
pub fn fit(self) -> Result<RegressionModel, Error> {
        let FittingData(input_vector, output_matrix, output_names) =
            Self::get_matrices_and_regressor_names(self)?;
        RegressionModel::try_from_matrices_and_regressor_names(
            input_vector,
            output_matrix,
            output_names,
        )
}
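    /// Like `fit`, but returns only the fitted parameters and skips computing
    /// the statistics, which is cheaper when only the coefficients are needed.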
pub fn fit_without_statistics(self) -> Result<RegressionParameters, Error> {
let FittingData(input_vector, output_matrix, output_names) =
Self::get_matrices_and_regressor_names(self)?;
let low_level_result = fit_ols_pinv(input_vector, output_matrix)?;
let parameters = low_level_result.params;
let intercept = parameters[0];
let slopes: Vec<_> = parameters.iter().cloned().skip(1).collect();
ensure!(
output_names.len() == slopes.len(),
Error::InconsistentSlopes(InconsistentSlopes::new(output_names.len(), slopes.len()))
);
Ok(RegressionParameters {
intercept_value: intercept,
regressor_values: slopes,
regressor_names: output_names.to_vec(),
})
}
fn get_matrices_and_regressor_names(self) -> Result<FittingData, Error> {
let (input, outputs) = self.get_data_columns()?;
let data = &self.data.ok_or(Error::NoData)?.data;
let input_vector = data
.get(input.as_ref())
.ok_or_else(|| Error::ColumnNotInData(input.into()))?;
let input_vector = RowDVector::from_vec(input_vector.to_vec());
let mut output_matrix = Vec::new();
let all_ones_column = iter::repeat(1.).take(input_vector.len());
output_matrix.extend(all_ones_column);
for output in outputs.to_owned() {
let output_vec = data
.get(output.as_ref())
.ok_or_else(|| Error::ColumnNotInData(output.to_string()))?;
ensure!(
output_vec.len() == input_vector.len(),
Error::RegressorRegressandDimensionMismatch(output.to_string())
);
output_matrix.extend(output_vec.iter());
}
let output_matrix = DMatrix::from_vec(input_vector.len(), outputs.len() + 1, output_matrix);
let outputs: Vec<_> = outputs.iter().map(|x| x.to_string()).collect();
Ok(FittingData(input_vector, output_matrix, outputs))
}
fn get_data_columns(&self) -> Result<(Cow<'_, str>, Vec<Cow<'_, str>>), Error> {
match (self.formula.as_ref(), self.columns.as_ref()) {
(Some(..), Some(..)) => Err(Error::BothFormulaAndDataColumnsGiven),
(Some(formula), None) => Self::parse_formula(formula),
(None, Some((regressand, regressors))) => {
ensure!(!regressors.is_empty(), Error::InvalidDataColumns);
Ok((regressand.clone(), regressors.clone()))
}
(None, None) => Err(Error::NoFormula),
}
}
fn parse_formula(formula: &str) -> Result<(Cow<'_, str>, Vec<Cow<'_, str>>), Error> {
let split_formula: Vec<_> = formula.split('~').collect();
ensure!(split_formula.len() == 2, Error::InvalidFormula);
let input = split_formula[0].trim();
let outputs: Vec<_> = split_formula[1]
.split('+')
.map(str::trim)
.filter(|x| !x.is_empty())
.map(|i| i.into())
.collect();
ensure!(!outputs.is_empty(), Error::InvalidFormula);
Ok((input.into(), outputs))
}
}
struct FittingData(RowDVector<f64>, DMatrix<f64>, Vec<String>);
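/// A validated collection of named data columns, created via
/// `RegressionDataBuilder::build_from`.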
#[derive(Debug, Clone)]
pub struct RegressionData<'a> {
data: HashMap<Cow<'a, str>, Vec<f64>>,
}
impl<'a> RegressionData<'a> {
fn new<I, S>(
data: I,
invalid_value_handling: InvalidValueHandling,
) -> Result<RegressionData<'a>, Error>
where
I: IntoIterator<Item = (S, Vec<f64>)>,
S: Into<Cow<'a, str>>,
{
let temp: HashMap<_, _> = data
.into_iter()
.map(|(key, value)| (key.into(), value))
.collect();
let first_key = temp.keys().next();
ensure!(
first_key.is_some(),
Error::RegressionDataError("The data contains no columns.".into())
);
let first_key = first_key.unwrap();
let first_len = temp[first_key].len();
ensure!(
first_len > 0,
Error::RegressionDataError("The data contains an empty column.".into())
);
for key in temp.keys() {
let this_len = temp[key].len();
ensure!(
this_len == first_len,
Error::RegressionDataError(
"The lengths of the columns in the given data are inconsistent.".into()
)
);
ensure!(
!key.contains('~') && !key.contains('+'),
Error::RegressionDataError(
"The column names may not contain `~` or `+`, because they are used \
as separators in the formula."
.into()
)
);
}
if Self::check_if_all_columns_are_equal(&temp) {
return Err(Error::RegressionDataError(
"All input columns contain only equal values. Fitting this model would lead \
to invalid statistics."
.into(),
));
}
if Self::check_if_data_is_valid(&temp) {
return Ok(Self { data: temp });
}
match invalid_value_handling {
InvalidValueHandling::ReturnError => Err(Error::RegressionDataError(
"The data contains a non real value (NaN or infinity or negative infinity). \
If you would like to silently drop these values configure the builder with \
InvalidValueHandling::DropInvalid."
.into(),
)),
InvalidValueHandling::DropInvalid => {
let temp = Self::drop_invalid_values(temp);
let first_key = temp.keys().next().expect("Cleaned data has no columns.");
let first_len = temp[first_key].len();
ensure!(
first_len > 0,
Error::RegressionDataError("The cleaned data is empty.".into())
);
Ok(Self { data: temp })
}
}
}
fn check_if_all_columns_are_equal(data: &HashMap<Cow<'a, str>, Vec<f64>>) -> bool {
for column in data.values() {
if column.is_empty() {
return false;
}
            let first = column[0];
            if !column.iter().all(|&x| x == first) {
                return false;
            }
}
true
}
fn check_if_data_is_valid(data: &HashMap<Cow<'a, str>, Vec<f64>>) -> bool {
for column in data.values() {
if column.iter().any(|x| !x.is_finite()) {
return false;
}
}
true
}
fn drop_invalid_values(
data: HashMap<Cow<'a, str>, Vec<f64>>,
) -> HashMap<Cow<'a, str>, Vec<f64>> {
let mut invalid_rows: BTreeSet<usize> = BTreeSet::new();
for column in data.values() {
for (index, value) in column.iter().enumerate() {
if !value.is_finite() {
invalid_rows.insert(index);
}
}
}
let mut cleaned_data = HashMap::new();
for (key, mut column) in data {
for index in invalid_rows.iter().rev() {
column.remove(*index);
}
cleaned_data.insert(key, column);
}
cleaned_data
}
}
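/// A builder used to assemble and validate a `RegressionData` set.
///
/// It checks that the given columns are non-empty, of equal length, not all
/// simultaneously constant, free of `~` and `+` in their names, and, by
/// default, that they contain only finite values.
///
/// A sketch of configuring the handling of non-finite values, adapted from
/// the tests below (`linregress` is assumed as the crate name):
///
/// ```
/// use linregress::{InvalidValueHandling, RegressionDataBuilder};
///
/// # fn main() -> Result<(), linregress::Error> {
/// let data = vec![("Y", vec![1., 2., 4.]), ("X", vec![1., std::f64::NAN, 3.])];
/// // The default, ReturnError, would reject the NaN; DropInvalid instead
/// // silently removes the affected row from every column.
/// let _data = RegressionDataBuilder::new()
///     .invalid_value_handling(InvalidValueHandling::DropInvalid)
///     .build_from(data)?;
/// # Ok(())
/// # }
/// ```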
#[derive(Debug, Clone, Copy)]
pub struct RegressionDataBuilder {
handle_invalid_values: InvalidValueHandling,
}
impl Default for RegressionDataBuilder {
fn default() -> RegressionDataBuilder {
RegressionDataBuilder {
handle_invalid_values: InvalidValueHandling::default(),
}
}
}
impl RegressionDataBuilder {
pub fn new() -> Self {
Self::default()
}
pub fn invalid_value_handling(mut self, setting: InvalidValueHandling) -> Self {
self.handle_invalid_values = setting;
self
}
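    /// Builds a `RegressionData` set from `(name, column)` pairs, validating
    /// the columns and applying the configured `InvalidValueHandling`.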
pub fn build_from<'a, I, S>(self, data: I) -> Result<RegressionData<'a>, Error>
where
I: IntoIterator<Item = (S, Vec<f64>)>,
S: Into<Cow<'a, str>>,
{
RegressionData::new(data, self.handle_invalid_values)
}
}
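/// How `RegressionDataBuilder` treats non-finite values (NaN, infinity, or
/// negative infinity) in the input data: return an error (the default) or
/// silently drop every row that contains such a value.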
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub enum InvalidValueHandling {
ReturnError,
DropInvalid,
}
impl Default for InvalidValueHandling {
fn default() -> InvalidValueHandling {
InvalidValueHandling::ReturnError
}
}
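/// A fitted regression model, with the estimated `parameters` and the
/// associated statistics: standard errors (`se`), the residual sum of squares
/// (`ssr`), `rsquared` and `rsquared_adj`, two-sided p-values based on the
/// t-distribution (`pvalues`), the `residuals`, and the residual variance
/// (`scale`).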
#[derive(Debug, Clone)]
pub struct RegressionModel {
pub parameters: RegressionParameters,
pub se: RegressionParameters,
pub ssr: f64,
pub rsquared: f64,
pub rsquared_adj: f64,
pub pvalues: RegressionParameters,
pub residuals: RegressionParameters,
pub scale: f64,
}
impl RegressionModel {
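    /// Uses the fitted model to predict the regressand for the given regressor
    /// columns. `new_data` must contain exactly the columns the model was
    /// fitted with, in any order, all of equal and non-zero length.
    ///
    /// A usage sketch, adapted from the prediction tests below (`linregress`
    /// is assumed as the crate name):
    ///
    /// ```
    /// use linregress::{FormulaRegressionBuilder, RegressionDataBuilder};
    ///
    /// # fn main() -> Result<(), linregress::Error> {
    /// let data = vec![
    ///     ("Y", vec![1., 2., 4., 3., 5.]),
    ///     ("X1", vec![5., 4., 3., 2., 1.]),
    /// ];
    /// let data = RegressionDataBuilder::new().build_from(data)?;
    /// let model = FormulaRegressionBuilder::new()
    ///     .data(&data)
    ///     .formula("Y ~ X1")
    ///     .fit()?;
    /// let predictions = model.predict(vec![("X1", vec![2.5, 3.5])])?;
    /// assert_eq!(predictions.len(), 2);
    /// # Ok(())
    /// # }
    /// ```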
pub fn predict<'a, I, S>(&self, new_data: I) -> Result<Vec<f64>, Error>
where
I: IntoIterator<Item = (S, Vec<f64>)>,
S: Into<Cow<'a, str>>,
{
let new_data: HashMap<Cow<'_, _>, Vec<f64>> = new_data
.into_iter()
.map(|(key, value)| (key.into(), value))
.collect();
self.check_variables(&new_data)?;
let input_len = new_data.values().next().unwrap().len();
let mut new_data_values: Vec<f64> = vec![];
for key in &self.parameters.regressor_names {
new_data_values.extend_from_slice(new_data[&Cow::from(key)].as_slice());
}
let new_data_matrix = DMatrix::from_vec(
input_len,
self.parameters.regressor_values.len(),
new_data_values,
);
let param_matrix = DMatrix::from_iterator(
self.parameters.regressor_values.len(),
1,
self.parameters.regressor_values.iter().copied(),
);
let intercept = self.parameters.intercept_value;
let intercept_matrix =
DMatrix::from_iterator(input_len, 1, std::iter::repeat(intercept).take(input_len));
let predictions = (new_data_matrix * param_matrix) + intercept_matrix;
        let predictions: Vec<f64> = predictions.iter().copied().collect();
Ok(predictions)
}
fn check_variables<'a>(
&self,
given_parameters: &HashMap<Cow<'a, str>, Vec<f64>>,
) -> Result<(), Error> {
ensure!(!given_parameters.is_empty(), Error::NoData);
let first_len = given_parameters.values().next().unwrap().len();
ensure!(first_len > 0, Error::NoData);
ensure!(
self.parameters.regressor_names.len() == self.parameters.regressor_values.len(),
Error::InconsistentRegressionModel
);
let model_parameters: HashSet<_> = self
.parameters
.regressor_names
.iter()
.map(Cow::from)
.collect();
for param in &model_parameters {
if !given_parameters.contains_key(param) {
return Err(Error::ColumnNotInData(param.to_string()));
}
}
for (param, values) in given_parameters {
ensure!(values.len() == first_len, Error::InconsistentVectors);
if !model_parameters.contains(param) {
return Err(Error::ModelColumnNotInData(param.to_string()));
}
}
Ok(())
}
fn try_from_matrices_and_regressor_names<I: IntoIterator<Item = String>>(
inputs: RowDVector<f64>,
outputs: DMatrix<f64>,
output_names: I,
) -> Result<Self, Error> {
        let low_level_result = fit_ols_pinv(inputs, outputs)?;
let LowLevelRegressionModel {
parameters,
se,
ssr,
rsquared,
rsquared_adj,
pvalues,
residuals,
scale,
} = LowLevelRegressionModel::from_low_level_regression(low_level_result)?;
let output_names: Vec<_> = output_names.into_iter().collect();
let intercept = parameters[0];
let slopes: Vec<f64> = parameters.into_iter().skip(1).collect();
ensure!(
output_names.len() == slopes.len(),
            Error::InconsistentSlopes(InconsistentSlopes::new(output_names.len(), slopes.len()))
);
let parameters = RegressionParameters {
intercept_value: intercept,
regressor_values: slopes,
regressor_names: output_names.to_vec(),
};
let se = RegressionParameters {
intercept_value: se[0],
regressor_values: se.iter().cloned().skip(1).collect(),
regressor_names: output_names.to_vec(),
};
let residuals = RegressionParameters {
intercept_value: residuals[0],
regressor_values: residuals.iter().cloned().skip(1).collect(),
regressor_names: output_names.to_vec(),
};
let pvalues = RegressionParameters {
intercept_value: pvalues[0],
regressor_values: pvalues.iter().cloned().skip(1).collect(),
regressor_names: output_names.to_vec(),
};
Ok(Self {
parameters,
se,
ssr,
rsquared,
rsquared_adj,
pvalues,
residuals,
scale,
})
}
}
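/// The result of fitting a model through the low-level interface
/// (`fit_low_level_regression_model`): the same statistics as
/// `RegressionModel`, but as plain vectors in column order, without column
/// names.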
#[derive(Debug, Clone)]
pub struct LowLevelRegressionModel {
pub parameters: Vec<f64>,
pub se: Vec<f64>,
pub ssr: f64,
pub rsquared: f64,
pub rsquared_adj: f64,
pub pvalues: Vec<f64>,
pub residuals: Vec<f64>,
pub scale: f64,
}
impl LowLevelRegressionModel {
fn from_low_level_regression(
low_level_result: InternalLowLevelRegressionResult,
) -> Result<Self, Error> {
let parameters = low_level_result.params;
let singular_values = low_level_result.singular_values;
let normalized_cov_params = low_level_result.normalized_cov_params;
let diag = DMatrix::from_diagonal(&singular_values);
        let rank = diag.rank(0.0);
let input_vec: Vec<_> = low_level_result.inputs.iter().copied().collect();
let input_matrix = DMatrix::from_vec(low_level_result.inputs.len(), 1, input_vec);
let residuals = &input_matrix - (low_level_result.outputs * parameters.to_owned());
let ssr = residuals.dot(&residuals);
let n = low_level_result.inputs.ncols();
let df_resid = n - rank;
        ensure!(
            df_resid >= 1,
            Error::ModelFittingError(
                "There are not enough residual degrees of freedom to perform statistics \
                 on this model"
                    .into()
            )
        );
let scale = residuals.dot(&residuals) / df_resid as f64;
let cov_params = normalized_cov_params * scale;
let se = get_se_from_cov_params(&cov_params)?;
let centered_input_matrix = subtract_value_from_matrix(&input_matrix, input_matrix.mean());
        let centered_tss = centered_input_matrix.dot(&centered_input_matrix);
let rsquared = 1. - (ssr / centered_tss);
let rsquared_adj = 1. - ((n - 1) as f64 / df_resid as f64 * (1. - rsquared));
let tvalues: Vec<_> = parameters
.iter()
.zip(se.iter())
.map(|(x, y)| x / y)
.collect();
let pvalues: Vec<_> = tvalues
.iter()
.cloned()
.map(|x| stdtr(df_resid as i64, -(x.abs())) * 2.)
.collect();
let parameters: Vec<f64> = parameters.iter().copied().collect();
let se: Vec<f64> = se.iter().copied().collect();
let residuals: Vec<f64> = residuals.iter().copied().collect();
Ok(Self {
parameters,
se,
ssr,
rsquared,
rsquared_adj,
pvalues,
residuals,
scale,
})
}
}
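/// The intercept and the per-regressor values of a fitted model; `pairs`
/// returns the regressor values zipped with their column names.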
#[derive(Debug, Clone)]
pub struct RegressionParameters {
pub intercept_value: f64,
pub regressor_names: Vec<String>,
pub regressor_values: Vec<f64>,
}
impl RegressionParameters {
pub fn pairs(&self) -> Vec<(&str, f64)> {
self.regressor_names
.iter()
.zip(self.regressor_values.iter())
.map(|(x, y)| (x.as_str(), *y))
.collect()
}
}
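/// Fits a model directly on a row-major matrix of `f64`s.
///
/// `data_row_major` must hold `num_rows * num_columns` values, where each row
/// is one observation, the first column is the regressand, and the remaining
/// columns are the regressors. No intercept column is added implicitly, so a
/// column of ones must be part of the data if an intercept is wanted.
///
/// A sketch adapted from the `test_low_level_model_fitting` test below
/// (`linregress` is assumed as the crate name):
///
/// ```
/// # fn main() -> Result<(), linregress::Error> {
/// // Four observations; the columns are [y, intercept (all ones), x].
/// let data_row_major = vec![
///     1., 1., 1., //
///     3., 1., 2., //
///     4., 1., 3., //
///     5., 1., 4., //
/// ];
/// let model = linregress::fit_low_level_regression_model(&data_row_major, 4, 3)?;
/// assert_eq!(model.parameters.len(), 2); // intercept and one slope
/// # Ok(())
/// # }
/// ```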
pub fn fit_low_level_regression_model(
data_row_major: &[f64],
num_rows: usize,
num_columns: usize,
) -> Result<LowLevelRegressionModel, Error> {
let regression = get_low_level_regression(data_row_major, num_rows, num_columns)?;
let model = LowLevelRegressionModel::from_low_level_regression(regression)?;
Ok(model)
}
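/// Like `fit_low_level_regression_model`, but returns only the fitted
/// parameters (in column order) and skips computing the statistics.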
pub fn fit_low_level_regression_model_without_statistics(
data_row_major: &[f64],
num_rows: usize,
num_columns: usize,
) -> Result<Vec<f64>, Error> {
let regression = get_low_level_regression(data_row_major, num_rows, num_columns)?;
Ok(regression.params.iter().copied().collect())
}
fn get_low_level_regression(
data_row_major: &[f64],
num_rows: usize,
num_columns: usize,
) -> Result<InternalLowLevelRegressionResult, Error> {
ensure!(
!data_row_major.is_empty() && num_rows * num_columns == data_row_major.len(),
Error::InconsistentVectors
);
let data = DMatrix::from_row_slice(num_rows, num_columns, data_row_major);
let inputs = data.slice((0, 0), (num_rows, 1));
let inputs: RowDVector<f64> = RowDVector::from_iterator(num_rows, inputs.iter().copied());
let outputs: DMatrix<f64> = data.slice((0, 1), (num_rows, num_columns - 1)).into_owned();
fit_ols_pinv(inputs, outputs)
}
#[derive(Debug, Clone)]
struct InternalLowLevelRegressionResult {
inputs: RowDVector<f64>,
outputs: DMatrix<f64>,
params: DMatrix<f64>,
singular_values: DVector<f64>,
normalized_cov_params: DMatrix<f64>,
}
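/// Fits ordinary least squares via the Moore-Penrose pseudo-inverse:
/// `params = pinv(outputs) * inputs`.
///
/// Note the naming convention used throughout this file: `inputs` is the
/// regressand vector and `outputs` is the regressor matrix. Besides the
/// parameters, the result carries the singular values of the regressor matrix
/// and the normalized covariance of the parameters (`pinv * pinv^T`), which
/// the statistics computations build on.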
fn fit_ols_pinv(
inputs: RowDVector<f64>,
outputs: DMatrix<f64>,
) -> Result<InternalLowLevelRegressionResult, Error> {
ensure!(
!inputs.is_empty(),
Error::ModelFittingError(
"Fitting the model failed because the input vector is empty".into()
)
);
ensure!(
outputs.nrows() >= 1 && outputs.ncols() >= 1,
Error::ModelFittingError(
"Fitting the model failed because the output matrix is empty".into()
)
);
let singular_values = outputs
.to_owned()
.try_svd(false, false, std::f64::EPSILON, 0)
.ok_or_else(|| {
Error::ModelFittingError(
"computing the singular-value decomposition of the output matrix failed".into(),
)
})?
.singular_values;
    let pinv = outputs.clone().pseudo_inverse(0.).map_err(|_| {
        Error::ModelFittingError("Taking the pinv of the output matrix failed".into())
    })?;
let normalized_cov_params = &pinv * &pinv.transpose();
let params = get_sum_of_products(&pinv, &inputs);
ensure!(
params.len() >= 2,
Error::ModelFittingError("Invalid parameter matrix".into())
);
Ok(InternalLowLevelRegressionResult {
inputs,
outputs,
params,
singular_values,
normalized_cov_params,
})
}
fn subtract_value_from_matrix(matrix: &DMatrix<f64>, value: f64) -> DMatrix<f64> {
let nrows = matrix.nrows();
let ncols = matrix.ncols();
    let subtraction_matrix: DMatrix<f64> =
        DMatrix::from_iterator(nrows, ncols, std::iter::repeat(value).take(nrows * ncols));
    matrix - subtraction_matrix
}
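/// Extracts the standard errors of the parameters from their (square)
/// covariance matrix by taking the square roots of its diagonal entries.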
fn get_se_from_cov_params(matrix: &DMatrix<f64>) -> Result<DMatrix<f64>, Error> {
    ensure!(
        matrix.nrows() == matrix.ncols(),
        Error::ModelFittingError("Matrix is not square".into())
    );
    let mut v = Vec::new();
    for row_index in 0..matrix.ncols() {
        v.push(matrix[(row_index, row_index)].sqrt());
    }
Ok(DMatrix::from_vec(matrix.ncols(), 1, v))
}
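/// Computes the matrix-vector product `matrix * vector` as a single-column
/// matrix.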
fn get_sum_of_products(matrix: &DMatrix<f64>, vector: &RowDVector<f64>) -> DMatrix<f64> {
let mut v: Vec<f64> = Vec::new();
for row_index in 0..matrix.nrows() {
let row = matrix.row(row_index);
let mut sum = 0.;
for (x, y) in row.iter().zip(vector.iter()) {
sum += x * y;
}
v.push(sum);
}
DMatrix::from_vec(matrix.nrows(), 1, v)
}
#[cfg(test)]
mod tests {
use super::*;
fn assert_almost_equal(a: f64, b: f64) {
assert_almost_equal_with_precision(a, b, 1.0E-14);
}
fn assert_almost_equal_with_precision(a: f64, b: f64, precision: f64) {
if (a - b).abs() > precision {
panic!("\n{:?} vs\n{:?}", a, b);
}
}
fn assert_slices_almost_equal(a: &[f64], b: &[f64]) {
assert_eq!(a.len(), b.len());
        for (&x, &y) in a.iter().zip(b.iter()) {
assert_almost_equal(x, y);
}
}
#[test]
fn test_pinv_with_formula_builder() {
use std::collections::HashMap;
let inputs = vec![1., 3., 4., 5., 2., 3., 4.];
let outputs1 = vec![1., 2., 3., 4., 5., 6., 7.];
let outputs2 = vec![7., 6., 5., 4., 3., 2., 1.];
let mut data = HashMap::new();
data.insert("Y", inputs);
data.insert("X1", outputs1);
data.insert("X2", outputs2);
let data = RegressionDataBuilder::new().build_from(data).unwrap();
let regression = FormulaRegressionBuilder::new()
.data(&data)
.formula("Y ~ X1 + X2")
.fit()
.expect("Fitting model failed");
let model_parameters = vec![0.09523809523809523, 0.5059523809523809, 0.2559523809523808];
let se = vec![
0.015457637291218289,
0.1417242813072997,
0.14172428130729975,
];
let ssr = 9.107142857142858;
let rsquared = 0.16118421052631582;
let rsquared_adj = -0.006578947368421018;
let scale = 1.8214285714285716;
let pvalues = vec![
0.001639031204417556,
0.016044083709847945,
0.13074580446389245,
];
let residuals = vec![
-1.392857142857142,
0.3571428571428581,
1.1071428571428577,
1.8571428571428577,
-1.3928571428571423,
-0.6428571428571423,
0.10714285714285765,
];
assert_almost_equal(regression.parameters.intercept_value, model_parameters[0]);
assert_almost_equal(
regression.parameters.regressor_values[0],
model_parameters[1],
);
assert_almost_equal(
regression.parameters.regressor_values[1],
model_parameters[2],
);
assert_almost_equal(regression.se.intercept_value, se[0]);
        assert_slices_almost_equal(&regression.se.regressor_values, &se[1..]);
assert_almost_equal(regression.ssr, ssr);
assert_almost_equal(regression.rsquared, rsquared);
assert_almost_equal(regression.rsquared_adj, rsquared_adj);
assert_almost_equal(regression.pvalues.intercept_value, pvalues[0]);
        assert_slices_almost_equal(&regression.pvalues.regressor_values, &pvalues[1..]);
assert_almost_equal(regression.residuals.intercept_value, residuals[0]);
        assert_slices_almost_equal(&regression.residuals.regressor_values, &residuals[1..]);
assert_eq!(regression.scale, scale);
}
#[test]
fn test_pinv_with_data_columns() {
use std::collections::HashMap;
let inputs = vec![1., 3., 4., 5., 2., 3., 4.];
let outputs1 = vec![1., 2., 3., 4., 5., 6., 7.];
let outputs2 = vec![7., 6., 5., 4., 3., 2., 1.];
let mut data = HashMap::new();
data.insert("Y", inputs);
data.insert("X1", outputs1);
data.insert("X2", outputs2);
let data = RegressionDataBuilder::new().build_from(data).unwrap();
let regression = FormulaRegressionBuilder::new()
.data(&data)
.data_columns("Y", ["X1", "X2"])
.fit()
.expect("Fitting model failed");
let model_parameters = vec![0.09523809523809523, 0.5059523809523809, 0.2559523809523808];
let se = vec![
0.015457637291218289,
0.1417242813072997,
0.14172428130729975,
];
let ssr = 9.107142857142858;
let rsquared = 0.16118421052631582;
let rsquared_adj = -0.006578947368421018;
let scale = 1.8214285714285716;
let pvalues = vec![
0.001639031204417556,
0.016044083709847945,
0.13074580446389245,
];
let residuals = vec![
-1.392857142857142,
0.3571428571428581,
1.1071428571428577,
1.8571428571428577,
-1.3928571428571423,
-0.6428571428571423,
0.10714285714285765,
];
assert_almost_equal(regression.parameters.intercept_value, model_parameters[0]);
assert_almost_equal(
regression.parameters.regressor_values[0],
model_parameters[1],
);
assert_almost_equal(
regression.parameters.regressor_values[1],
model_parameters[2],
);
assert_almost_equal(regression.se.intercept_value, se[0]);
        assert_slices_almost_equal(&regression.se.regressor_values, &se[1..]);
assert_almost_equal(regression.ssr, ssr);
assert_almost_equal(regression.rsquared, rsquared);
assert_almost_equal(regression.rsquared_adj, rsquared_adj);
assert_almost_equal(regression.pvalues.intercept_value, pvalues[0]);
        assert_slices_almost_equal(&regression.pvalues.regressor_values, &pvalues[1..]);
assert_almost_equal(regression.residuals.intercept_value, residuals[0]);
        assert_slices_almost_equal(&regression.residuals.regressor_values, &residuals[1..]);
assert_eq!(regression.scale, scale);
}
#[test]
fn test_low_level_model_fitting() {
let inputs = vec![1., 3., 4., 5., 2., 3., 4.];
let outputs1 = vec![1., 2., 3., 4., 5., 6., 7.];
let outputs2 = vec![7., 6., 5., 4., 3., 2., 1.];
let mut data_row_major = Vec::with_capacity(4 * 7);
for n in 0..7 {
data_row_major.push(inputs[n]);
data_row_major.push(1.0);
data_row_major.push(outputs1[n]);
data_row_major.push(outputs2[n]);
}
let regression = fit_low_level_regression_model(&data_row_major, 7, 4).unwrap();
let model_parameters = vec![0.09523809523809523, 0.5059523809523809, 0.2559523809523808];
let se = vec![
0.015457637291218289,
0.1417242813072997,
0.14172428130729975,
];
let ssr = 9.107142857142858;
let rsquared = 0.16118421052631582;
let rsquared_adj = -0.006578947368421018;
let scale = 1.8214285714285716;
let pvalues = vec![
0.001639031204417556,
0.016044083709847945,
0.13074580446389245,
];
let residuals = vec![
-1.392857142857142,
0.3571428571428581,
1.1071428571428577,
1.8571428571428577,
-1.3928571428571423,
-0.6428571428571423,
0.10714285714285765,
];
        assert_slices_almost_equal(&regression.parameters, &model_parameters);
        assert_slices_almost_equal(&regression.se, &se);
assert_almost_equal(regression.ssr, ssr);
assert_almost_equal(regression.rsquared, rsquared);
assert_almost_equal(regression.rsquared_adj, rsquared_adj);
        assert_slices_almost_equal(&regression.pvalues, &pvalues);
        assert_slices_almost_equal(&regression.residuals, &residuals);
assert_eq!(regression.scale, scale);
}
#[test]
fn test_without_statistics() {
use std::collections::HashMap;
let inputs = vec![1., 3., 4., 5., 2., 3., 4.];
let outputs1 = vec![1., 2., 3., 4., 5., 6., 7.];
let outputs2 = vec![7., 6., 5., 4., 3., 2., 1.];
let mut data = HashMap::new();
data.insert("Y", inputs);
data.insert("X1", outputs1);
data.insert("X2", outputs2);
let data = RegressionDataBuilder::new().build_from(data).unwrap();
let regression = FormulaRegressionBuilder::new()
.data(&data)
.formula("Y ~ X1 + X2")
.fit_without_statistics()
.expect("Fitting model failed");
let model_parameters = vec![0.09523809523809523, 0.5059523809523809, 0.2559523809523808];
assert_almost_equal(regression.intercept_value, model_parameters[0]);
assert_almost_equal(regression.regressor_values[0], model_parameters[1]);
assert_almost_equal(regression.regressor_values[1], model_parameters[2]);
}
#[test]
fn test_invalid_input_empty_matrix() {
let y = vec![];
let x1 = vec![];
let x2 = vec![];
let data = vec![("Y", y), ("X1", x1), ("X2", x2)];
let data = RegressionDataBuilder::new().build_from(data);
assert!(data.is_err());
}
#[test]
fn test_invalid_input_wrong_shape_x() {
let y = vec![1., 2., 3.];
let x1 = vec![1., 2., 3.];
let x2 = vec![1., 2.];
let data = vec![("Y", y), ("X1", x1), ("X2", x2)];
let data = RegressionDataBuilder::new().build_from(data);
assert!(data.is_err());
}
#[test]
fn test_invalid_input_wrong_shape_y() {
let y = vec![1., 2., 3., 4.];
let x1 = vec![1., 2., 3.];
let x2 = vec![1., 2., 3.];
let data = vec![("Y", y), ("X1", x1), ("X2", x2)];
let data = RegressionDataBuilder::new().build_from(data);
assert!(data.is_err());
}
#[test]
fn test_invalid_input_nan() {
let y1 = vec![1., 2., 3., 4.];
let x1 = vec![1., 2., 3., std::f64::NAN];
let data1 = vec![("Y", y1), ("X", x1)];
let y2 = vec![1., 2., 3., std::f64::NAN];
let x2 = vec![1., 2., 3., 4.];
let data2 = vec![("Y", y2), ("X", x2)];
let r_data1 = RegressionDataBuilder::new().build_from(data1.to_owned());
let r_data2 = RegressionDataBuilder::new().build_from(data2.to_owned());
assert!(r_data1.is_err());
assert!(r_data2.is_err());
let builder = RegressionDataBuilder::new();
let builder = builder.invalid_value_handling(InvalidValueHandling::DropInvalid);
let r_data1 = builder.build_from(data1);
let r_data2 = builder.build_from(data2);
assert!(r_data1.is_ok());
assert!(r_data2.is_ok());
}
#[test]
fn test_invalid_input_infinity() {
let y1 = vec![1., 2., 3., 4.];
let x1 = vec![1., 2., 3., std::f64::INFINITY];
let data1 = vec![("Y", y1), ("X", x1)];
let y2 = vec![1., 2., 3., std::f64::NEG_INFINITY];
let x2 = vec![1., 2., 3., 4.];
let data2 = vec![("Y", y2), ("X", x2)];
let r_data1 = RegressionDataBuilder::new().build_from(data1.to_owned());
let r_data2 = RegressionDataBuilder::new().build_from(data2.to_owned());
assert!(r_data1.is_err());
assert!(r_data2.is_err());
let builder = RegressionDataBuilder::new();
let builder = builder.invalid_value_handling(InvalidValueHandling::DropInvalid);
let r_data1 = builder.build_from(data1);
let r_data2 = builder.build_from(data2);
assert!(r_data1.is_ok());
assert!(r_data2.is_ok());
}
#[test]
fn test_invalid_input_all_equal_columns() {
let y = vec![38.0, 38.0, 38.0];
let x = vec![42.0, 42.0, 42.0];
let data = vec![("y", y), ("x", x)];
let data = RegressionDataBuilder::new().build_from(data);
assert!(data.is_err());
}
#[test]
fn test_drop_invalid_values() {
let mut data: HashMap<Cow<'_, str>, Vec<f64>> = HashMap::new();
data.insert("Y".into(), vec![-1., -2., -3., -4.]);
data.insert("foo".into(), vec![1., 2., 12., 4.]);
data.insert("bar".into(), vec![1., 1., 7., 4.]);
data.insert("baz".into(), vec![1.3333, 2.754, 3.12, 4.11]);
assert_eq!(RegressionData::drop_invalid_values(data.to_owned()), data);
data.insert(
"invalid".into(),
vec![std::f64::NAN, 42., std::f64::NEG_INFINITY, 23.11],
);
data.insert(
"invalid2".into(),
vec![1.337, -3.14, std::f64::INFINITY, 11.111111],
);
let mut ref_data: HashMap<Cow<'_, str>, Vec<f64>> = HashMap::new();
ref_data.insert("Y".into(), vec![-2., -4.]);
ref_data.insert("foo".into(), vec![2., 4.]);
ref_data.insert("bar".into(), vec![1., 4.]);
ref_data.insert("baz".into(), vec![2.754, 4.11]);
ref_data.insert("invalid".into(), vec![42., 23.11]);
ref_data.insert("invalid2".into(), vec![-3.14, 11.111111]);
assert_eq!(
ref_data,
RegressionData::drop_invalid_values(data.to_owned())
);
}
#[test]
fn test_all_invalid_input() {
let data = vec![
("Y", vec![1., 2., 3.]),
("X", vec![std::f64::NAN, std::f64::NAN, std::f64::NAN]),
];
let builder = RegressionDataBuilder::new();
let builder = builder.invalid_value_handling(InvalidValueHandling::DropInvalid);
let r_data = builder.build_from(data);
assert!(r_data.is_err());
}
#[test]
fn test_invalid_column_names() {
let data1 = vec![("x~f", vec![1., 2., 3.]), ("foo", vec![0., 0., 0.])];
let data2 = vec![("foo", vec![1., 2., 3.]), ("foo+", vec![0., 0., 0.])];
let builder = RegressionDataBuilder::new();
assert!(builder.build_from(data1).is_err());
assert!(builder.build_from(data2).is_err());
}
#[test]
fn test_no_formula() {
let data = vec![("x", vec![1., 2., 3.]), ("foo", vec![0., 0., 0.])];
let data = RegressionDataBuilder::new().build_from(data).unwrap();
let res = FormulaRegressionBuilder::new().data(&data).fit();
assert!(res.is_err());
}
#[test]
fn test_both_formula_and_data_columns() {
let y = vec![1., 2., 3., 4., 5.];
let x1 = vec![5., 4., 3., 2., 1.];
let x2 = vec![729.53, 439.0367, 42.054, 1., 0.];
let x3 = vec![258.589, 616.297, 215.061, 498.361, 0.];
let data = vec![("Y", y), ("X1", x1), ("X2", x2), ("X3", x3)];
let data = RegressionDataBuilder::new().build_from(data).unwrap();
let formula = "Y ~ X1 + X2 + X3";
let res = FormulaRegressionBuilder::new()
.data(&data)
.formula(formula)
.data_columns("Y", ["X1", "X2", "X3"])
.fit();
assert!(res.is_err());
}
fn build_model() -> RegressionModel {
let y = vec![1., 2., 3., 4., 5.];
let x1 = vec![5., 4., 3., 2., 1.];
let x2 = vec![729.53, 439.0367, 42.054, 1., 0.];
let x3 = vec![258.589, 616.297, 215.061, 498.361, 0.];
let data = vec![("Y", y), ("X1", x1), ("X2", x2), ("X3", x3)];
let data = RegressionDataBuilder::new().build_from(data).unwrap();
let formula = "Y ~ X1 + X2 + X3";
FormulaRegressionBuilder::new()
.data(&data)
.formula(formula)
.fit()
.unwrap()
}
#[test]
fn test_prediction_empty_vectors() {
let model = build_model();
let new_data: HashMap<Cow<'_, _>, _> = vec![("X1", vec![]), ("X2", vec![]), ("X3", vec![])]
.into_iter()
.map(|(x, y)| (Cow::from(x), y))
.collect();
assert!(model.check_variables(&new_data).is_err());
}
#[test]
fn test_prediction_vectors_with_different_lengths() {
let model = build_model();
let new_data: HashMap<Cow<'_, _>, _> = vec![
("X1", vec![1.0, 2.0]),
("X2", vec![2.0, 1.0]),
("X3", vec![3.0]),
]
.into_iter()
.map(|(x, y)| (Cow::from(x), y))
.collect();
assert!(model.check_variables(&new_data).is_err());
}
#[test]
fn test_too_many_prediction_variables() {
let model = build_model();
let new_data: HashMap<Cow<'_, _>, _> = vec![
("X1", vec![1.0]),
("X2", vec![2.0]),
("X3", vec![3.0]),
("X4", vec![4.0]),
]
.into_iter()
.map(|(x, y)| (Cow::from(x), y))
.collect();
assert!(model.check_variables(&new_data).is_err());
}
#[test]
fn test_not_enough_prediction_variables() {
let model = build_model();
let new_data: HashMap<Cow<'_, _>, _> = vec![("X1", vec![1.0]), ("X2", vec![2.0])]
.into_iter()
.map(|(x, y)| (Cow::from(x), y))
.collect();
assert!(model.check_variables(&new_data).is_err());
}
#[test]
fn test_prediction_broken_model() {
let mut model = build_model();
model.parameters.regressor_values = vec![];
let new_data: HashMap<Cow<'_, _>, _> =
vec![("X1", vec![1.0]), ("X2", vec![2.0]), ("X3", vec![3.0])]
.into_iter()
.map(|(x, y)| (Cow::from(x), y))
.collect();
assert!(model.check_variables(&new_data).is_err());
}
#[test]
fn test_prediction() {
let model = build_model();
let new_data = vec![("X1", vec![2.5]), ("X2", vec![2.0]), ("X3", vec![2.0])];
let prediction = model.predict(new_data).unwrap();
assert_eq!(prediction.len(), 1);
assert_almost_equal_with_precision(prediction[0], 3.500000000000111, 1.0E-7);
}
#[test]
fn test_multiple_predictions() {
let model = build_model();
let new_data = vec![
("X1", vec![2.5, 3.5]),
("X2", vec![2.0, 8.0]),
("X3", vec![2.0, 1.0]),
];
let prediction = model.predict(new_data).unwrap();
assert_eq!(prediction.len(), 2);
assert_almost_equal_with_precision(prediction[0], 3.500000000000111, 1.0E-7);
assert_almost_equal_with_precision(prediction[1], 2.5000000000001337, 1.0E-7);
}
#[test]
fn test_multiple_predictions_out_of_order() {
let model = build_model();
let new_data = vec![
("X1", vec![2.5, 3.5]),
("X3", vec![2.0, 1.0]),
("X2", vec![2.0, 8.0]),
];
let prediction = model.predict(new_data).unwrap();
assert_eq!(prediction.len(), 2);
assert_almost_equal_with_precision(prediction[0], 3.500000000000111, 1.0E-7);
assert_almost_equal_with_precision(prediction[1], 2.5000000000001337, 1.0E-7);
}
}
#[cfg(all(feature = "unstable", test))]
mod bench {
use super::*;
extern crate test;
use test::Bencher;
#[bench]
fn bench_with_stats(b: &mut Bencher) {
let y = vec![1., 2., 3., 4., 5.];
let x1 = vec![5., 4., 3., 2., 1.];
let x2 = vec![729.53, 439.0367, 42.054, 1., 0.];
let x3 = vec![258.589, 616.297, 215.061, 498.361, 0.];
let data = vec![("Y", y), ("X1", x1), ("X2", x2), ("X3", x3)];
let data = RegressionDataBuilder::new().build_from(data).unwrap();
let formula = "Y ~ X1 + X2 + X3";
b.iter(|| {
FormulaRegressionBuilder::new()
.data(&data)
.formula(formula)
.fit()
});
}
#[bench]
fn bench_data_columns_with_stats(b: &mut Bencher) {
let y = vec![1., 2., 3., 4., 5.];
let x1 = vec![5., 4., 3., 2., 1.];
let x2 = vec![729.53, 439.0367, 42.054, 1., 0.];
let x3 = vec![258.589, 616.297, 215.061, 498.361, 0.];
let data = vec![("Y", y), ("X1", x1), ("X2", x2), ("X3", x3)];
let data = RegressionDataBuilder::new().build_from(data).unwrap();
b.iter(|| {
FormulaRegressionBuilder::new()
.data(&data)
.data_columns("Y", ["X1", "X2", "X3"])
.fit()
});
}
#[bench]
fn bench_without_stats(b: &mut Bencher) {
let y = vec![1., 2., 3., 4., 5.];
let x1 = vec![5., 4., 3., 2., 1.];
let x2 = vec![729.53, 439.0367, 42.054, 1., 0.];
let x3 = vec![258.589, 616.297, 215.061, 498.361, 0.];
let data = vec![("Y", y), ("X1", x1), ("X2", x2), ("X3", x3)];
let data = RegressionDataBuilder::new().build_from(data).unwrap();
let formula = "Y ~ X1 + X2 + X3";
b.iter(|| {
FormulaRegressionBuilder::new()
.data(&data)
.formula(formula)
.fit_without_statistics()
});
}
#[bench]
fn bench_data_columns_without_stats(b: &mut Bencher) {
let y = vec![1., 2., 3., 4., 5.];
let x1 = vec![5., 4., 3., 2., 1.];
let x2 = vec![729.53, 439.0367, 42.054, 1., 0.];
let x3 = vec![258.589, 616.297, 215.061, 498.361, 0.];
let data = vec![("Y", y), ("X1", x1), ("X2", x2), ("X3", x3)];
let data = RegressionDataBuilder::new().build_from(data).unwrap();
b.iter(|| {
FormulaRegressionBuilder::new()
.data(&data)
.data_columns("Y", ["X1", "X2", "X3"])
.fit_without_statistics()
});
}
}