diff --git a/pyhealth/models/logistic_regression.py b/pyhealth/models/logistic_regression.py index 8155d101f..d421427e5 100644 --- a/pyhealth/models/logistic_regression.py +++ b/pyhealth/models/logistic_regression.py @@ -10,22 +10,33 @@ class LogisticRegression(BaseModel): - """Logistic/Linear regression baseline model. + """Logistic/Linear regression baseline model with optional L1 regularization. This model uses embeddings from different input features and applies a single linear transformation (no hidden layers or non-linearity) to produce predictions. - + - For classification tasks: acts as logistic regression - For regression tasks: acts as linear regression - + The model automatically handles different input types through the EmbeddingModel, pools sequence dimensions, concatenates all feature embeddings, and applies a final linear layer. + L1 regularization (``l1_lambda > 0``) adds a sparsity-inducing penalty to the + weight vector during training, equivalent to scikit-learn's + ``LogisticRegression(penalty='l1', C=C)`` with ``l1_lambda = 1 / (C * n_train)``. + This is the formulation used in Boag et al. (2018) "Racial Disparities and + Mistrust in End-of-Life Care" (MLHC 2018) to train interpersonal-feature + mistrust classifiers on MIMIC-III. + Args: dataset: the dataset to train the model. It is used to query certain information such as the set of all tokens. embedding_dim: the embedding dimension. Default is 128. + l1_lambda: coefficient for the L1 weight penalty added to the loss. + ``loss = BCE + l1_lambda * ||W||_1``. Set to 0.0 (default) to + disable regularization (backward-compatible). Equivalent to + ``1 / (C * n_train)`` for sklearn's C-parameterised formulation. **kwargs: other parameters (for compatibility). Examples: @@ -55,7 +66,7 @@ class LogisticRegression(BaseModel): ... dataset_name="test") >>> >>> from pyhealth.models import LogisticRegression - >>> model = LogisticRegression(dataset=dataset) + >>> model = LogisticRegression(dataset=dataset, l1_lambda=1e-4) >>> >>> from pyhealth.datasets import get_dataloader >>> train_loader = get_dataloader(dataset, batch_size=2, shuffle=True) @@ -64,7 +75,7 @@ class LogisticRegression(BaseModel): >>> ret = model(**data_batch) >>> print(ret) { - 'loss': tensor(0.6931, grad_fn=), + 'loss': tensor(0.6931, grad_fn=), 'y_prob': tensor([[0.5123], [0.4987]], grad_fn=), 'y_true': tensor([[1.], @@ -80,10 +91,12 @@ def __init__( self, dataset: SampleDataset, embedding_dim: int = 128, + l1_lambda: float = 0.0, **kwargs, ): super(LogisticRegression, self).__init__(dataset) self.embedding_dim = embedding_dim + self.l1_lambda = l1_lambda assert len(self.label_keys) == 1, "Only one label key is supported" self.label_key = self.label_keys[0] @@ -197,6 +210,10 @@ def forward(self, **kwargs) -> Dict[str, torch.Tensor]: # Obtain y_true, loss, y_prob y_true = kwargs[self.label_key].to(self.device) loss = self.get_loss_function()(logits, y_true) + # L1 regularization on the final linear layer's weights (bias excluded), + # equivalent to sklearn's penalty='l1' with C = 1 / (l1_lambda * n_train). + if self.l1_lambda > 0.0: + loss = loss + self.l1_lambda * self.fc.weight.abs().sum() y_prob = self.prepare_y_prob(logits) results = {