Skip to content

Vanillix Module

Vanillix

Bases: BasePipeline

Vanillix-specific version of the BasePipeline class.

Inherits the preprocess, fit, predict, evaluate, and visualize methods from BasePipeline, which this class extends. See the parent class for a full list of attributes and methods.

Additional Attributes

_default_config: Is set to VanillixConfig here.

Source code in src/autoencodix/vanillix.py
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
class Vanillix(BasePipeline):
    """Vanillix-specific version of the BasePipeline class.

    Inherits preprocess, fit, predict, evaluate, and visualize methods from
    BasePipeline. See the parent class for a full list of attributes and
    methods.

    Additional Attributes:
        _default_config: Is set to VanillixConfig here.

    """

    def __init__(
        self,
        data: Optional[Union[DataPackage, DatasetContainer]] = None,
        trainer_type: Type[BaseTrainer] = GeneralTrainer,
        dataset_type: Type[BaseDataset] = NumericDataset,
        model_type: Type[BaseAutoencoder] = VanillixArchitecture,
        loss_type: Type[BaseLoss] = VanillixLoss,
        preprocessor_type: Type[BasePreprocessor] = GeneralPreprocessor,
        visualizer: Type[BaseVisualizer] = GeneralVisualizer,
        evaluator: Optional[Type[BaseEvaluator]] = GeneralEvaluator,
        result: Optional[Result] = None,
        datasplitter_type: Type[DataSplitter] = DataSplitter,
        custom_splits: Optional[Dict[str, np.ndarray]] = None,
        config: Optional[DefaultConfig] = None,
        ontologies: Optional[Union[List, Dict]] = None,
    ) -> None:
        """Initialize Vanillix pipeline with customizable components.

        Some components are passed as types rather than instances because they
        require data that is only available after preprocessing.

        All arguments are forwarded unchanged to the parent initializer; note
        that ``custom_splits`` is handed over under the parent's
        ``custom_split`` keyword. See the parent class for the full list of
        Args.
        """
        # Set before the parent initializer runs.
        self._default_config = VanillixConfig()
        super().__init__(
            data=data,
            dataset_type=dataset_type,
            trainer_type=trainer_type,
            model_type=model_type,
            loss_type=loss_type,
            preprocessor_type=preprocessor_type,
            visualizer=visualizer,
            evaluator=evaluator,
            result=result,
            datasplitter_type=datasplitter_type,
            config=config,
            custom_split=custom_splits,
            ontologies=ontologies,
        )

    def sample_latent_space(
        self,
        n_samples: int,
        split: str = "test",
        epoch: int = -1,
    ) -> torch.Tensor:
        """Samples latent space points from the empirical latent distribution.

        This method draws new latent points by fitting a diagonal Gaussian
        distribution (per-dimension mean and standard deviation) to the latent
        codes of the specified split and epoch, and sampling from it. This
        enables approximate generative sampling for autoencoders that do not
        model uncertainty explicitly.

        If only a single latent code is stored, no spread can be estimated;
        the standard deviation is then treated as zero, so every returned
        sample equals that code instead of being NaN.

        Args:
            n_samples: The number of latent points to sample. Must be a positive
                integer.
            split: The split to sample from (train, valid, test), default is test.
            epoch: The epoch to sample from, default is the last epoch (-1).

        Returns:
            z: torch.Tensor - The sampled latent space points, of shape
                (n_samples, latent_dim), on the model's device and dtype.

        Raises:
            ValueError: If the model has not been trained, latent codes have not
                been computed, the stored latent codes are empty or not
                two-dimensional, or n_samples is not a positive integer.
            TypeError: If the stored latent codes are not numpy arrays.
        """

        if not hasattr(self, "_trainer") or self._trainer is None:
            raise ValueError("Model is not trained yet. Please train the model first.")
        if self.result.latentspaces is None:
            raise ValueError("Model has no stored latent codes for sampling.")
        if not isinstance(n_samples, int) or n_samples <= 0:
            raise ValueError("n_samples must be a positive integer.")

        Z = self.result.latentspaces.get(split=split, epoch=epoch)

        if not isinstance(Z, np.ndarray):
            raise TypeError(
                f"Expected latent codes to be of type numpy.ndarray, got {type(Z)}."
            )
        # The statistics below reduce over axis 0 and read the latent width from
        # axis 1, so anything but a 2-D array cannot be handled meaningfully.
        if Z.ndim != 2:
            raise ValueError(
                f"Expected latent codes to be a 2-D array, got {Z.ndim} dimension(s)."
            )
        # A mean over zero rows is NaN; fail loudly instead of returning NaNs.
        if Z.shape[0] == 0:
            raise ValueError(
                f"No latent codes stored for split '{split}' at epoch {epoch}."
            )

        Z_t = torch.from_numpy(Z).to(
            device=self._trainer._model.device,
            dtype=self._trainer._model.dtype,
        )

        with torch.no_grad():
            # Fit empirical diagonal Gaussian
            global_mu = Z_t.mean(dim=0)
            if Z_t.shape[0] > 1:
                global_std = Z_t.std(dim=0)
            else:
                # The default (unbiased) std of a single row is NaN, which
                # would turn every sample into NaN; use zero spread instead.
                global_std = torch.zeros_like(global_mu)

            eps = torch.randn(
                n_samples,
                Z_t.shape[1],
                device=Z_t.device,
                dtype=Z_t.dtype,
            )

            z = global_mu + eps * global_std
            return z

__init__(data=None, trainer_type=GeneralTrainer, dataset_type=NumericDataset, model_type=VanillixArchitecture, loss_type=VanillixLoss, preprocessor_type=GeneralPreprocessor, visualizer=GeneralVisualizer, evaluator=GeneralEvaluator, result=None, datasplitter_type=DataSplitter, custom_splits=None, config=None, ontologies=None)

Initialize Vanillix pipeline with customizable components.

Some components are passed as types rather than instances because they require data that is only available after preprocessing.

See the parent class implementation for the full list of Args.

Source code in src/autoencodix/vanillix.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def __init__(
    self,
    data: Optional[Union[DataPackage, DatasetContainer]] = None,
    trainer_type: Type[BaseTrainer] = GeneralTrainer,
    dataset_type: Type[BaseDataset] = NumericDataset,
    model_type: Type[BaseAutoencoder] = VanillixArchitecture,
    loss_type: Type[BaseLoss] = VanillixLoss,
    preprocessor_type: Type[BasePreprocessor] = GeneralPreprocessor,
    visualizer: Type[BaseVisualizer] = GeneralVisualizer,
    evaluator: Optional[Type[BaseEvaluator]] = GeneralEvaluator,
    result: Optional[Result] = None,
    datasplitter_type: Type[DataSplitter] = DataSplitter,
    custom_splits: Optional[Dict[str, np.ndarray]] = None,
    config: Optional[DefaultConfig] = None,
    ontologies: Optional[Union[List, Dict]] = None,
) -> None:
    """Set up a Vanillix pipeline from its pluggable components.

    Several components are accepted as classes instead of instances, since
    they need data that only becomes available after preprocessing.

    Every argument is forwarded unchanged to the parent initializer, with
    ``custom_splits`` handed over under the parent's ``custom_split`` keyword.
    See the parent class implementation for the full list of Args.
    """
    # Assigned before the parent initializer is invoked.
    self._default_config = VanillixConfig()

    # Keyword arguments follow the order of this signature.
    super().__init__(
        data=data,
        trainer_type=trainer_type,
        dataset_type=dataset_type,
        model_type=model_type,
        loss_type=loss_type,
        preprocessor_type=preprocessor_type,
        visualizer=visualizer,
        evaluator=evaluator,
        result=result,
        datasplitter_type=datasplitter_type,
        custom_split=custom_splits,
        config=config,
        ontologies=ontologies,
    )

sample_latent_space(n_samples, split='test', epoch=-1)

Samples latent space points from the empirical latent distribution.

This method draws new latent points by fitting a diagonal Gaussian distribution to the latent codes of the specified split and epoch, and sampling from it. This enables approximate generative sampling for autoencoders that do not model uncertainty explicitly.

Parameters:

Name Type Description Default
n_samples int

The number of latent points to sample. Must be a positive integer.

required
split str

The split to sample from (train, valid, test), default is test.

'test'
epoch int

The epoch to sample from, default is the last epoch (-1).

-1

Returns:

Name Type Description
z Tensor

torch.Tensor - The sampled latent space points.

Raises:

Type Description
ValueError

If the model has not been trained, latent codes have not been computed, or n_samples is not a positive integer.

TypeError

If the stored latent codes are not numpy arrays.

Source code in src/autoencodix/vanillix.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def sample_latent_space(
    self,
    n_samples: int,
    split: str = "test",
    epoch: int = -1,
) -> torch.Tensor:
    """Samples latent space points from the empirical latent distribution.

    This method draws new latent points by fitting a diagonal Gaussian
    distribution (per-dimension mean and standard deviation) to the latent
    codes of the specified split and epoch, and sampling from it. This
    enables approximate generative sampling for autoencoders that do not
    model uncertainty explicitly.

    If only a single latent code is stored, no spread can be estimated; the
    standard deviation is then treated as zero, so every returned sample
    equals that code instead of being NaN.

    Args:
        n_samples: The number of latent points to sample. Must be a positive
            integer.
        split: The split to sample from (train, valid, test), default is test.
        epoch: The epoch to sample from, default is the last epoch (-1).

    Returns:
        z: torch.Tensor - The sampled latent space points, of shape
            (n_samples, latent_dim), on the model's device and dtype.

    Raises:
        ValueError: If the model has not been trained, latent codes have not
            been computed, the stored latent codes are empty or not
            two-dimensional, or n_samples is not a positive integer.
        TypeError: If the stored latent codes are not numpy arrays.
    """

    if not hasattr(self, "_trainer") or self._trainer is None:
        raise ValueError("Model is not trained yet. Please train the model first.")
    if self.result.latentspaces is None:
        raise ValueError("Model has no stored latent codes for sampling.")
    if not isinstance(n_samples, int) or n_samples <= 0:
        raise ValueError("n_samples must be a positive integer.")

    Z = self.result.latentspaces.get(split=split, epoch=epoch)

    if not isinstance(Z, np.ndarray):
        raise TypeError(
            f"Expected latent codes to be of type numpy.ndarray, got {type(Z)}."
        )
    # The statistics below reduce over axis 0 and read the latent width from
    # axis 1, so anything but a 2-D array cannot be handled meaningfully.
    if Z.ndim != 2:
        raise ValueError(
            f"Expected latent codes to be a 2-D array, got {Z.ndim} dimension(s)."
        )
    # A mean over zero rows is NaN; fail loudly instead of returning NaNs.
    if Z.shape[0] == 0:
        raise ValueError(
            f"No latent codes stored for split '{split}' at epoch {epoch}."
        )

    Z_t = torch.from_numpy(Z).to(
        device=self._trainer._model.device,
        dtype=self._trainer._model.dtype,
    )

    with torch.no_grad():
        # Fit empirical diagonal Gaussian
        global_mu = Z_t.mean(dim=0)
        if Z_t.shape[0] > 1:
            global_std = Z_t.std(dim=0)
        else:
            # The default (unbiased) std of a single row is NaN, which would
            # turn every sample into NaN; use zero spread instead.
            global_std = torch.zeros_like(global_mu)

        eps = torch.randn(
            n_samples,
            Z_t.shape[1],
            device=Z_t.device,
            dtype=Z_t.dtype,
        )

        z = global_mu + eps * global_std
        return z