# Born's Classifier SQL

from bornrule.sql import BornClassifierSQL


> **Warning**
>
> This SQL implementation is in beta release. It is compatible with SQLite v3.24.0+ and PostgreSQL 14. Previous versions of PostgreSQL may also work, but they have not been tested.

SQL implementation of Born's Classifier

This class is compatible with SQLite and PostgreSQL. Data items are to be passed as list of dictionaries in the format [{feature: value, ...}, ...]. This classifier is suitable for classification with non-negative feature values. The values are treated as unnormalized probability distributions.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `engine` | `Engine` or `str` | SQLAlchemy engine or connection string to connect to the database. | `'sqlite:///'` |
| `prefix` | `str` | The prefix to use for the tables in the database. Instances created with different prefix are independent from each other. | `'bc'` |
| `type_features` | `TraversibleType` | SQLAlchemy type of the features. | `String` |
| `type_classes` | `TraversibleType` | SQLAlchemy type of the classes. | `Integer` |

Attributes:

| Name | Type | Description |
|------|------|-------------|
| `db` | `Database` | Database class acting as interpreter between Python and the database. |

Source code in bornrule/sql/born.py
  15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 class BornClassifierSQL: """SQL implementation of Born's Classifier This class is compatible with SQLite and PostgreSQL. Data items are to be passed as list of dictionaries in the format [{feature: value, ...}, ...]. This classifier is suitable for classification with non-negative feature values. The values are treated as unnormalized probability distributions. Parameters ---------- engine : Engine or str [SQLAlchemy engine or connection string](https://docs.sqlalchemy.org/en/14/core/engines.html) to connect to the database. prefix : str The prefix to use for the tables in the database. Instances created with different prefix are independent from each other. 
type_features : TraversibleType [SQLAlchemy type](https://docs.sqlalchemy.org/en/14/core/type_basics.html#generic-camelcase-types) of the features. type_classes : TraversibleType [SQLAlchemy type](https://docs.sqlalchemy.org/en/14/core/type_basics.html#generic-camelcase-types) of the classes. Attributes ---------- db : Database [Database class](https://github.com/eguidotti/bornrule/blob/main/bornrule/sql/database.py) acting as interpreter between python and the database. """ def __init__(self, engine='sqlite:///', prefix='bc', type_features=String, type_classes=Integer): if isinstance(engine, str): engine = create_engine(engine, echo=False) kwargs = { 'engine': engine, 'prefix': prefix, 'type_features': type_features, 'type_classes': type_classes } slug = engine.url.get_dialect().name if slug == 'sqlite': self.db = SQLite(**kwargs) elif slug == 'postgresql': self.db = PostgreSQL(**kwargs) else: raise ValueError( f"Backend {slug} is not implemented yet. Please open an issue at " f"https://github.com/eguidotti/bornrule/issues " f"to add support for {slug}." ) def get_params(self): """Get parameters. Returns ------- params : dict Model's hyper-parameters a, b, h. """ with self.db.connect() as con: return self.db.read_params(con) def set_params(self, a, b, h): """Set parameters. Parameters ---------- a : float Amplitude. Must be strictly positive. b : float Balance. Must be non-negative. h : float Entropy. Must be non-negative. """ self.db.check_editable() if a <= 0: raise ValueError("The parameter 'a' must be strictly positive.") if b < 0: raise ValueError("The parameter 'b' must be non-negative.") if h < 0: raise ValueError("The parameter 'h' must be non-negative.") with self.db.connect() as con: self.db.write_params(con, a=a, b=b, h=h) def fit(self, X, y, sample_weight=None): """Fit the classifier according to the training data X, y. Parameters ---------- X : list of dict of length n_samples Training data in the format [{feature: value, ...}, ...]. 
y : list-like of length n_samples List giving the target class for each sample. If a list of dict in the format [{class: value, ...}, ...], then each dict gives the distribution of the classes for each sample (e.g., multi-labeled samples) sample_weight : list-like of length n_samples List of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. Returns ------- self : object Returns the instance itself. """ self.db.check_editable() self._validate(X=X, y=y, sample_weight=sample_weight) with self.db.connect() as con: self.db.table_corpus.drop(con, checkfirst=True) return self.partial_fit(X, y, sample_weight=sample_weight) def partial_fit(self, X, y, sample_weight=None): """Incremental fit on a batch of samples. This method is expected to be called several times consecutively on different chunks of a dataset so as to implement out-of-core or online learning. Parameters ---------- X : list of dict of length n_samples Training data in the format [{feature: value, ...}, ...]. y : list-like of length n_samples List giving the target class for each sample. If a list of dict in the format [{class: value, ...}, ...], then each dict gives the distribution of the classes for each sample (e.g., multi-labeled samples) sample_weight : list-like of length n_samples List of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. Returns ------- self : object Returns the instance itself. """ self.db.check_editable() self._validate(X=X, y=y, sample_weight=sample_weight) if sample_weight is None: sample_weight = [1] * len(X) with self.db.connect() as con: self.db.write_corpus(con, X=X, y=y, sample_weight=sample_weight) return self def predict(self, X): """Perform classification on the test data X. Parameters ---------- X : list of dict of length n_samples Test data in the format [{feature: value, ...}, ...]. Returns ------- y : list of length n_samples Predicted target classes for X. 
""" self.db.check_fitted() self._validate(X=X) with self.db.connect() as con: classes = self.db.predict(con, X=X) classes = dict(zip(classes[self.db.FIELD_ITEM], classes[self.db.FIELD_CLASS])) classes = [classes[i] if i in classes else None for i in range(len(X))] return classes def predict_proba(self, X): """Return probability estimates for the test data X. Parameters ---------- X : list of dict of length n_samples Test data in the format [{feature: value, ...}, ...]. Returns ------- y : DataFrame of shape (n_samples, n_classes) Returns the probability of the samples for each class in the model. """ self.db.check_fitted() self._validate(X=X) with self.db.connect() as con: proba = self.db.predict_proba(con, X=X) proba = self._pivot(proba, index=self.db.FIELD_ITEM, columns=self.db.FIELD_CLASS, values=self.db.FIELD_WEIGHT) proba = proba.reindex(range(len(X))).sparse.to_dense() return proba def explain(self, X=None, sample_weight=None): r"""Global and local explanation For each test vector $x$, the $a$-th power of the unnormalized probability for the $k$-th class is given by the matrix product: math u_k^a = \sum_j W_{jk}x_j^a  where $W$ is a matrix of non-negative weights that generally depends on the model's hyper-parameters ($a$, $b$, $h$). The classification probabilities are obtained by normalizing $u$ such that it sums up to $1$. This method returns global or local feature importance weights, depending on X: - When X is not provided, this method returns the global weights $W$. - When X is a single sample, this method returns a matrix of entries $(j,k)$ where each entry is given by $W_{jk}x_j^a$. - When X contains multiple samples, then the values above are computed for each sample and this method returns their weighted sum. By default, each sample is given unit weight. Parameters ---------- X : list of dict of length n_samples Test data in the format [{feature: value, ...}, ...]. If not provided, then global weights are returned. 
sample_weight : list-like of length n_samples List of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. Returns ------- E : DataFrame of shape (n_features, n_classes) Returns the feature importance for each class in the model. """ self.db.check_fitted() if X is not None: self._validate(X=X, sample_weight=sample_weight) with self.db.connect() as con: W = self.db.explain(con, X=X, sample_weight=sample_weight) return self._pivot(W, index=self.db.FIELD_FEATURE, columns=self.db.FIELD_CLASS, values=self.db.FIELD_WEIGHT) def deploy(self): """Deploy the instance Generate and store the weights that are used for prediction to speed up inference time. A deployed instance cannot be modified. To update a deployed instance, undeploy it first. """ with self.db.connect() as con: self.db.deploy(con) def undeploy(self): """Undeploy the instance Drop the weights that are used for prediction. Weights will be recomputed each time on-the-fly. Useful for development, testing, and incremental fit. """ with self.db.connect() as con: self.db.undeploy(con) @staticmethod def _validate(X, y="no_validation", sample_weight=None): only_X = isinstance(y, str) and y == "no_validation" if not isinstance(X, list): raise ValueError( "X must be a list of dict in the form [{feature: value, ...}, ...]" ) for i, x in enumerate(X): if not isinstance(x, dict): raise ValueError( f"Element {i} of X is not a dict" ) for _, value in x.items(): if value < 0: raise ValueError( f"Element {i} of X contains negative values" ) if sample_weight is not None: if len(X) != len(sample_weight): raise ValueError( "Dimension mismatch. X and sample_weight must have the same length" ) for i, value in enumerate(sample_weight): if value < 0: raise ValueError( f"Element {i} of sample_weight contains negative values" ) if not only_X: if len(X) != len(y): raise ValueError( "Dimension mismatch. 
X and y must have the same length" ) @staticmethod def _pivot(df, index, columns, values): df[values] = df[values].astype(pd.SparseDtype(float)) df = df.pivot(index=index, columns=columns, values=values) df = df.astype(pd.SparseDtype(float, fill_value=0)) df.rename_axis(None, axis=0, inplace=True) df.rename_axis(None, axis=1, inplace=True) return df 

## get_params()

Get parameters.

Returns:

Name Type Description
params dict

Model's hyper-parameters a, b, h.

Source code in bornrule/sql/born.py
 72 73 74 75 76 77 78 79 80 81 82 def get_params(self): """Get parameters. Returns ------- params : dict Model's hyper-parameters a, b, h. """ with self.db.connect() as con: return self.db.read_params(con) 

## set_params(a, b, h)

Set parameters.

Parameters:

Name Type Description Default
a float

Amplitude. Must be strictly positive.

required
b float

Balance. Must be non-negative.

required
h float

Entropy. Must be non-negative.

required
Source code in bornrule/sql/born.py
  84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 def set_params(self, a, b, h): """Set parameters. Parameters ---------- a : float Amplitude. Must be strictly positive. b : float Balance. Must be non-negative. h : float Entropy. Must be non-negative. """ self.db.check_editable() if a <= 0: raise ValueError("The parameter 'a' must be strictly positive.") if b < 0: raise ValueError("The parameter 'b' must be non-negative.") if h < 0: raise ValueError("The parameter 'h' must be non-negative.") with self.db.connect() as con: self.db.write_params(con, a=a, b=b, h=h) 

## fit(X, y, sample_weight=None)

Fit the classifier according to the training data X, y.

Parameters:

Name Type Description Default
X list of dict of length n_samples

Training data in the format [{feature: value, ...}, ...].

required
y list-like of length n_samples

List giving the target class for each sample. If a list of dict in the format [{class: value, ...}, ...], then each dict gives the distribution of the classes for each sample (e.g., multi-labeled samples)

required
sample_weight list-like of length n_samples

List of weights that are assigned to individual samples. If not provided, then each sample is given unit weight.

None

Returns:

Name Type Description
self object

Returns the instance itself.

Source code in bornrule/sql/born.py
 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 def fit(self, X, y, sample_weight=None): """Fit the classifier according to the training data X, y. Parameters ---------- X : list of dict of length n_samples Training data in the format [{feature: value, ...}, ...]. y : list-like of length n_samples List giving the target class for each sample. If a list of dict in the format [{class: value, ...}, ...], then each dict gives the distribution of the classes for each sample (e.g., multi-labeled samples) sample_weight : list-like of length n_samples List of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. Returns ------- self : object Returns the instance itself. """ self.db.check_editable() self._validate(X=X, y=y, sample_weight=sample_weight) with self.db.connect() as con: self.db.table_corpus.drop(con, checkfirst=True) return self.partial_fit(X, y, sample_weight=sample_weight) 

## partial_fit(X, y, sample_weight=None)

Incremental fit on a batch of samples.

This method is expected to be called several times consecutively on different chunks of a dataset so as to implement out-of-core or online learning.

Parameters:

Name Type Description Default
X list of dict of length n_samples

Training data in the format [{feature: value, ...}, ...].

required
y list-like of length n_samples

List giving the target class for each sample. If a list of dict in the format [{class: value, ...}, ...], then each dict gives the distribution of the classes for each sample (e.g., multi-labeled samples)

required
sample_weight list-like of length n_samples

List of weights that are assigned to individual samples. If not provided, then each sample is given unit weight.

None

Returns:

Name Type Description
self object

Returns the instance itself.

Source code in bornrule/sql/born.py
 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 def partial_fit(self, X, y, sample_weight=None): """Incremental fit on a batch of samples. This method is expected to be called several times consecutively on different chunks of a dataset so as to implement out-of-core or online learning. Parameters ---------- X : list of dict of length n_samples Training data in the format [{feature: value, ...}, ...]. y : list-like of length n_samples List giving the target class for each sample. If a list of dict in the format [{class: value, ...}, ...], then each dict gives the distribution of the classes for each sample (e.g., multi-labeled samples) sample_weight : list-like of length n_samples List of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. Returns ------- self : object Returns the instance itself. """ self.db.check_editable() self._validate(X=X, y=y, sample_weight=sample_weight) if sample_weight is None: sample_weight = [1] * len(X) with self.db.connect() as con: self.db.write_corpus(con, X=X, y=y, sample_weight=sample_weight) return self 

## predict(X)

Perform classification on the test data X.

Parameters:

Name Type Description Default
X list of dict of length n_samples

Test data in the format [{feature: value, ...}, ...].

required

Returns:

Name Type Description
y list of length n_samples

Predicted target classes for X.

Source code in bornrule/sql/born.py
 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 def predict(self, X): """Perform classification on the test data X. Parameters ---------- X : list of dict of length n_samples Test data in the format [{feature: value, ...}, ...]. Returns ------- y : list of length n_samples Predicted target classes for X. """ self.db.check_fitted() self._validate(X=X) with self.db.connect() as con: classes = self.db.predict(con, X=X) classes = dict(zip(classes[self.db.FIELD_ITEM], classes[self.db.FIELD_CLASS])) classes = [classes[i] if i in classes else None for i in range(len(X))] return classes 

## predict_proba(X)

Return probability estimates for the test data X.

Parameters:

Name Type Description Default
X list of dict of length n_samples

Test data in the format [{feature: value, ...}, ...].

required

Returns:

Name Type Description
y DataFrame of shape (n_samples, n_classes)

Returns the probability of the samples for each class in the model.

Source code in bornrule/sql/born.py
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 def predict_proba(self, X): """Return probability estimates for the test data X. Parameters ---------- X : list of dict of length n_samples Test data in the format [{feature: value, ...}, ...]. Returns ------- y : DataFrame of shape (n_samples, n_classes) Returns the probability of the samples for each class in the model. """ self.db.check_fitted() self._validate(X=X) with self.db.connect() as con: proba = self.db.predict_proba(con, X=X) proba = self._pivot(proba, index=self.db.FIELD_ITEM, columns=self.db.FIELD_CLASS, values=self.db.FIELD_WEIGHT) proba = proba.reindex(range(len(X))).sparse.to_dense() return proba 

## explain(X=None, sample_weight=None)

Global and local explanation

For each test vector $x$, the $a$-th power of the unnormalized probability for the $k$-th class is given by the matrix product:

$u_k^a = \sum_j W_{jk}x_j^a$

where $W$ is a matrix of non-negative weights that generally depends on the model's hyper-parameters ($a$, $b$, $h$). The classification probabilities are obtained by normalizing $u$ such that it sums up to $1$.

This method returns global or local feature importance weights, depending on X:

• When X is not provided, this method returns the global weights $W$.

• When X is a single sample, this method returns a matrix of entries $(j,k)$ where each entry is given by $W_{jk}x_j^a$.

• When X contains multiple samples, then the values above are computed for each sample and this method returns their weighted sum. By default, each sample is given unit weight.

Parameters:

Name Type Description Default
X list of dict of length n_samples

Test data in the format [{feature: value, ...}, ...]. If not provided, then global weights are returned.

None
sample_weight list-like of length n_samples

List of weights that are assigned to individual samples. If not provided, then each sample is given unit weight.

None

Returns:

Name Type Description
E DataFrame of shape (n_features, n_classes)

Returns the feature importance for each class in the model.

Source code in bornrule/sql/born.py
 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 def explain(self, X=None, sample_weight=None): r"""Global and local explanation For each test vector $x$, the $a$-th power of the unnormalized probability for the $k$-th class is given by the matrix product: math u_k^a = \sum_j W_{jk}x_j^a  where $W$ is a matrix of non-negative weights that generally depends on the model's hyper-parameters ($a$, $b$, $h$). The classification probabilities are obtained by normalizing $u$ such that it sums up to $1$. This method returns global or local feature importance weights, depending on X: - When X is not provided, this method returns the global weights $W$. - When X is a single sample, this method returns a matrix of entries $(j,k)$ where each entry is given by $W_{jk}x_j^a$. - When X contains multiple samples, then the values above are computed for each sample and this method returns their weighted sum. By default, each sample is given unit weight. Parameters ---------- X : list of dict of length n_samples Test data in the format [{feature: value, ...}, ...]. If not provided, then global weights are returned. sample_weight : list-like of length n_samples List of weights that are assigned to individual samples. If not provided, then each sample is given unit weight. Returns ------- E : DataFrame of shape (n_features, n_classes) Returns the feature importance for each class in the model. """ self.db.check_fitted() if X is not None: self._validate(X=X, sample_weight=sample_weight) with self.db.connect() as con: W = self.db.explain(con, X=X, sample_weight=sample_weight) return self._pivot(W, index=self.db.FIELD_FEATURE, columns=self.db.FIELD_CLASS, values=self.db.FIELD_WEIGHT) 

## deploy()

Deploy the instance

Generate and store the weights that are used for prediction to speed up inference time. A deployed instance cannot be modified. To update a deployed instance, undeploy it first.

Source code in bornrule/sql/born.py
 272 273 274 275 276 277 278 279 280 def deploy(self): """Deploy the instance Generate and store the weights that are used for prediction to speed up inference time. A deployed instance cannot be modified. To update a deployed instance, undeploy it first. """ with self.db.connect() as con: self.db.deploy(con) 

## undeploy()

Undeploy the instance

Drop the weights that are used for prediction. Weights will be recomputed each time on-the-fly. Useful for development, testing, and incremental fit.

Source code in bornrule/sql/born.py
 282 283 284 285 286 287 288 289 290 def undeploy(self): """Undeploy the instance Drop the weights that are used for prediction. Weights will be recomputed each time on-the-fly. Useful for development, testing, and incremental fit. """ with self.db.connect() as con: self.db.undeploy(con)