# Source code for causalcompass.datasets.mixed_data

# Portions of this file are adapted from MiTCD
# Copyright (c) 2025 ChunhuiZhao
# https://github.com/chunhuiz/MiTCD
# Licensed under the MIT License
"""
Mixed data generation for VAR and Lorenz 96 datasets.

Reference:
    [1] https://github.com/chunhuiz/MiTCD/blob/main/data_process.py
"""

import numpy as np
import torch
import random
import os
from .vanilla import simulate_var, simulate_lorenz_96
from sklearn import preprocessing
from copy import deepcopy


def generate_mixed_var(p, T, lag=3, sparsity=0.2, beta_value=1.0, sd=0.1,
                       burn_in=100, typeflag=None, discrete_ratio=0.5, seed=0,
                       length_per_batch=50, device=torch.device('cuda:0')):
    """
    Generate VAR data where a proportion of variables are discretized.

    References
    ----------
    https://github.com/chunhuiz/MiTCD

    Parameters
    ----------
    p : int
        Number of variables
    T : int
        Number of time points; must be a multiple of ``length_per_batch``
    lag : int, default 3
        Number of lags in the VAR model
    sparsity : float, default 0.2
        Sparsity of the causal graph
    beta_value : float, default 1.0
        Coefficient value
    sd : float, default 0.1
        Noise standard deviation
    burn_in : int, default 100
        Burn-in period
    typeflag : list or None, default None
        Manual specification of variable types (0=discrete, 1=continuous).
        If None, automatically generated based on discrete_ratio
    discrete_ratio : float, default 0.5
        Ratio of discrete variables
    seed : int or None, default 0
        Random seed; pass None to skip seeding
    length_per_batch : int, default 50
        Length of each batch for instance normalization
    device : torch.device, default torch.device('cuda:0')
        Device for the intermediate tensor computation. The returned arrays
        are always NumPy (moved back to CPU), so this does not change results.

    Returns
    -------
    tuple
        (data_bin_global, data_bin_inst, GC) — globally binarized time series
        of shape (T, p), instance-normalized and binarized time series of
        shape (num_batches, length_per_batch, p), and ground-truth causal
        graph of shape (p, p). data_bin_inst is used as input for the MiTCD
        algorithm, while all other algorithms use data_bin_global as input.

    Raises
    ------
    ValueError
        If ``T`` is not a multiple of ``length_per_batch``.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Fail early with a clear message instead of an opaque reshape error below.
    if T % length_per_batch != 0:
        raise ValueError(
            f'T ({T}) must be divisible by length_per_batch ({length_per_batch})')

    # Randomly pick which variables become discrete unless caller specified.
    if typeflag is None:
        num_discrete = int(round(p * discrete_ratio))
        discrete_indices = np.random.choice(p, size=num_discrete, replace=False)
        typeflag = [0 if i in discrete_indices else 1 for i in range(p)]

    X_np, beta, GC = simulate_var(p=p, T=T, lag=lag, sparsity=sparsity,
                                  beta_value=beta_value, sd=sd,
                                  burn_in=burn_in, seed=seed)

    # Global min-max scaling over the whole series, then binarize the
    # discrete variables at a 0.5 threshold.
    min_max_scaler = preprocessing.MinMaxScaler()
    X_scaled = min_max_scaler.fit_transform(X_np)
    X_scaled_bin = X_scaled.copy()
    for i, flag in enumerate(typeflag):
        if flag == 0:
            X_scaled_bin[:, i] = X_scaled_bin[:, i] > 0.5

    # Per-batch (instance) min-max normalization. Reshape directly in NumPy:
    # the original NumPy->torch->NumPy round-trip added nothing but a cast,
    # which astype(float32) reproduces.
    # NOTE(review): a batch whose column is constant divides by zero here and
    # yields NaNs, matching the reference implementation — confirm upstream
    # data makes this impossible.
    X_pre = X_scaled.reshape(-1, length_per_batch, p).astype(np.float32)
    X_real = np.zeros_like(X_pre)
    for i in range(X_real.shape[0]):
        for j in range(p):
            instance = X_pre[i, :, j]
            lo, hi = np.min(instance), np.max(instance)
            X_real[i, :, j] = (instance - lo) / (hi - lo)

    # Binarize the discrete variables again on the instance-normalized data.
    X_real_tensor = torch.tensor(X_real, dtype=torch.float32, device=device)
    # clone() is the idiomatic tensor copy (deepcopy works but is indirect).
    X_bin_inst = X_real_tensor.clone()
    for i in range(p):
        if typeflag[i] == 0:
            X_bin_inst[:, :, i] = X_bin_inst[:, :, i] > 0.5

    return X_scaled_bin.astype(np.float32), X_bin_inst.cpu().numpy(), GC
def generate_mixed_lorenz_96(p, T, F=10.0, delta_t=0.1, sd=0.1, burn_in=1000,
                             typeflag=None, discrete_ratio=0.5, seed=0,
                             length_per_batch=50,
                             device=torch.device('cuda:0')):
    """
    Generate Lorenz-96 data where a proportion of variables are discretized.

    References
    ----------
    https://github.com/chunhuiz/MiTCD

    Parameters
    ----------
    p : int
        Number of variables
    T : int
        Number of time points; must be a multiple of ``length_per_batch``
    F : float, default 10.0
        Forcing parameter
    delta_t : float, default 0.1
        Time step for ODE solver
    sd : float, default 0.1
        Noise standard deviation
    burn_in : int, default 1000
        Burn-in period
    typeflag : list or None, default None
        Manual specification of variable types (0=discrete, 1=continuous).
        If None, automatically generated based on discrete_ratio
    discrete_ratio : float, default 0.5
        Ratio of discrete variables
    seed : int or None, default 0
        Random seed; pass None to skip seeding
    length_per_batch : int, default 50
        Length of each batch for instance normalization
    device : torch.device, default torch.device('cuda:0')
        Device for the intermediate tensor computation. The returned arrays
        are always NumPy (moved back to CPU), so this does not change results.

    Returns
    -------
    tuple
        (data_bin_global, data_bin_inst, GC) — globally binarized time series
        of shape (T, p), instance-normalized and binarized time series of
        shape (num_batches, length_per_batch, p), and ground-truth causal
        graph of shape (p, p). data_bin_inst is used as input for the MiTCD
        algorithm, while all other algorithms use data_bin_global as input.

    Raises
    ------
    ValueError
        If ``T`` is not a multiple of ``length_per_batch``.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Fail early with a clear message instead of an opaque reshape error below.
    if T % length_per_batch != 0:
        raise ValueError(
            f'T ({T}) must be divisible by length_per_batch ({length_per_batch})')

    # Randomly pick which variables become discrete unless caller specified.
    if typeflag is None:
        num_discrete = int(round(p * discrete_ratio))
        discrete_indices = np.random.choice(p, size=num_discrete, replace=False)
        typeflag = [0 if i in discrete_indices else 1 for i in range(p)]

    X_np, GC = simulate_lorenz_96(p=p, T=T, F=F, delta_t=delta_t, sd=sd,
                                  burn_in=burn_in, seed=seed)

    # Global min-max scaling over the whole series, then binarize the
    # discrete variables at a 0.5 threshold.
    min_max_scaler = preprocessing.MinMaxScaler()
    X_scaled = min_max_scaler.fit_transform(X_np)
    X_scaled_bin = X_scaled.copy()
    for i, flag in enumerate(typeflag):
        if flag == 0:
            X_scaled_bin[:, i] = X_scaled_bin[:, i] > 0.5

    # Per-batch (instance) min-max normalization. Reshape directly in NumPy:
    # the original NumPy->torch->NumPy round-trip added nothing but a cast,
    # which astype(float32) reproduces.
    # NOTE(review): a batch whose column is constant divides by zero here and
    # yields NaNs, matching the reference implementation — confirm upstream
    # data makes this impossible.
    X_pre = X_scaled.reshape(-1, length_per_batch, p).astype(np.float32)
    X_real = np.zeros_like(X_pre)
    for i in range(X_real.shape[0]):
        for j in range(p):
            instance = X_pre[i, :, j]
            lo, hi = np.min(instance), np.max(instance)
            X_real[i, :, j] = (instance - lo) / (hi - lo)

    # Binarize the discrete variables again on the instance-normalized data.
    X_real_tensor = torch.tensor(X_real, dtype=torch.float32, device=device)
    # clone() is the idiomatic tensor copy (deepcopy works but is indirect).
    X_bin_inst = X_real_tensor.clone()
    for i in range(p):
        if typeflag[i] == 0:
            X_bin_inst[:, :, i] = X_bin_inst[:, :, i] > 0.5

    return X_scaled_bin.astype(np.float32), X_bin_inst.cpu().numpy(), GC