Yinuo Zhang
committed on
Commit
·
d79b4f8
1
Parent(s):
724b6d6
Add large file using Git LFS
Browse files- configs/config.yaml +171 -0
- configs/path.yaml +7 -0
- main.py +27 -26
- peptune-pretrained.ckpt +3 -0
configs/config.yaml
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
defaults:
|
| 2 |
+
- path
|
| 3 |
+
|
| 4 |
+
noise:
|
| 5 |
+
type: loglinear
|
| 6 |
+
sigma_min: 1e-4
|
| 7 |
+
sigma_max: 20
|
| 8 |
+
state_dependent: True
|
| 9 |
+
|
| 10 |
+
mode: ppl_eval # train / ppl_eval / sample_eval
|
| 11 |
+
diffusion: absorbing_state
|
| 12 |
+
vocab: old_smiles # old_smiles / new_smiles / selfies / helm
|
| 13 |
+
backbone: roformer # peptideclm / helmgpt / dit / roformer / finetune_roformer
|
| 14 |
+
parameterization: subs # subs
|
| 15 |
+
time_conditioning: False
|
| 16 |
+
T: 0 # 0 (continuous time) / 1000
|
| 17 |
+
subs_masking: False
|
| 18 |
+
|
| 19 |
+
seed: 42
|
| 20 |
+
|
| 21 |
+
mcts:
|
| 22 |
+
num_children: 50
|
| 23 |
+
num_objectives: 5
|
| 24 |
+
topk: 100
|
| 25 |
+
mask_token: 4
|
| 26 |
+
num_iter: 128
|
| 27 |
+
sampling: 0 # 0 is gumbel sampling / > 0 samples children from top k probs
|
| 28 |
+
invalid_penalty: 0.5
|
| 29 |
+
sample_prob: 1.0
|
| 30 |
+
perm: True
|
| 31 |
+
dual: False
|
| 32 |
+
single: False
|
| 33 |
+
time_dependent: True
|
| 34 |
+
|
| 35 |
+
lr_scheduler:
|
| 36 |
+
_target_: transformers.get_constant_schedule_with_warmup
|
| 37 |
+
num_warmup_steps: 2500
|
| 38 |
+
|
| 39 |
+
data:
|
| 40 |
+
train: ${paths.data}/finetune2/30K-train.csv
|
| 41 |
+
valid: ${paths.data}/finetune2/30K-val.csv
|
| 42 |
+
batching: wrapping # padding / wrapping
|
| 43 |
+
|
| 44 |
+
loader:
|
| 45 |
+
global_batch_size: 64
|
| 46 |
+
eval_global_batch_size: ${.global_batch_size}
|
| 47 |
+
# Note: batch_size and eval_batch_size are **per machine**
|
| 48 |
+
batch_size: ${div_up:${.global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
|
| 49 |
+
eval_batch_size: ${div_up:${.eval_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
|
| 50 |
+
num_workers: ${eval:"len(__import__('os').sched_getaffinity(0))"}
|
| 51 |
+
pin_memory: True
|
| 52 |
+
|
| 53 |
+
sampling:
|
| 54 |
+
predictor: ddpm_cache # analytic, ddpm, ddpm_cache
|
| 55 |
+
num_sequences: 100
|
| 56 |
+
sampling_eps: 1e-3
|
| 57 |
+
steps: 128
|
| 58 |
+
seq_length: 100
|
| 59 |
+
noise_removal: True
|
| 60 |
+
num_sample_batches: 2 # Total samples: `num_gpus` * `loader.eval_batch_size` * num_sample_batches
|
| 61 |
+
num_sample_log: 2
|
| 62 |
+
stride_length: 1
|
| 63 |
+
num_strides: 1
|
| 64 |
+
|
| 65 |
+
training:
|
| 66 |
+
antithetic_sampling: True
|
| 67 |
+
sampling_eps: 1e-3
|
| 68 |
+
focus_mask: False
|
| 69 |
+
#dynamic_batching: True
|
| 70 |
+
accumulator: False
|
| 71 |
+
|
| 72 |
+
eval:
|
| 73 |
+
checkpoint_path: ${paths.checkpoints}/11M-old-tokenizer/epoch=10-step=156276.ckpt
|
| 74 |
+
disable_ema: False
|
| 75 |
+
compute_generative_perplexity: False
|
| 76 |
+
perplexity_batch_size: 8
|
| 77 |
+
compute_perplexity_on_sanity: False
|
| 78 |
+
gen_ppl_eval_model_name_or_path: gpt2-large # gpt2-large, meta-llama/Llama-2-7b-hf
|
| 79 |
+
generate_samples: True
|
| 80 |
+
generation_model: ${paths.checkpoints}/11M-old-tokenizer/
|
| 81 |
+
|
| 82 |
+
optim:
|
| 83 |
+
weight_decay: 0.075
|
| 84 |
+
lr: 3e-4
|
| 85 |
+
beta1: 0.9
|
| 86 |
+
beta2: 0.999
|
| 87 |
+
eps: 1e-8
|
| 88 |
+
|
| 89 |
+
pepclm:
|
| 90 |
+
hidden_size: 768
|
| 91 |
+
cond_dim: 256
|
| 92 |
+
n_heads: 20
|
| 93 |
+
n_blocks: 4
|
| 94 |
+
dropout: 0.5
|
| 95 |
+
length: 512
|
| 96 |
+
#scale_by_sigma: True
|
| 97 |
+
|
| 98 |
+
model:
|
| 99 |
+
type: ddit
|
| 100 |
+
hidden_size: 768
|
| 101 |
+
cond_dim: 128
|
| 102 |
+
length: 512
|
| 103 |
+
n_blocks: 12
|
| 104 |
+
n_heads: 12
|
| 105 |
+
scale_by_sigma: True
|
| 106 |
+
dropout: 0.1
|
| 107 |
+
|
| 108 |
+
roformer:
|
| 109 |
+
hidden_size: 768
|
| 110 |
+
n_layers: 8
|
| 111 |
+
n_heads: 8
|
| 112 |
+
max_position_embeddings: 1035
|
| 113 |
+
|
| 114 |
+
helmgpt:
|
| 115 |
+
hidden_size: 256
|
| 116 |
+
embd_pdrop: 0.1
|
| 117 |
+
resid_pdrop: 0.1
|
| 118 |
+
attn_pdrop: 0.1
|
| 119 |
+
ff_dropout: 0.
|
| 120 |
+
block_size: 140
|
| 121 |
+
n_layer: 8
|
| 122 |
+
n_heads: 8
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
trainer:
|
| 126 |
+
_target_: lightning.Trainer
|
| 127 |
+
accelerator: cuda
|
| 128 |
+
num_nodes: 1
|
| 129 |
+
devices: ${device_count:}
|
| 130 |
+
accumulate_grad_batches: ${div_up:${loader.global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}
|
| 131 |
+
gradient_clip_val: 1.0
|
| 132 |
+
precision: 64-true
|
| 133 |
+
num_sanity_val_steps: 2
|
| 134 |
+
max_epochs: 100
|
| 135 |
+
max_steps: 1_000_000
|
| 136 |
+
log_every_n_steps: 10
|
| 137 |
+
limit_train_batches: 1.0 # train on full dataset, can be used to toggle quick run
|
| 138 |
+
limit_val_batches: 1.0 # validate on full dataset, can be used to toggle quick run
|
| 139 |
+
#val_check_interval: 40 #954
|
| 140 |
+
check_val_every_n_epoch: 1
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
wandb:
|
| 144 |
+
project: ${env_or:WANDB_PROJECT,peptune}
|
| 145 |
+
notes: null
|
| 146 |
+
group: null
|
| 147 |
+
job_type: null
|
| 148 |
+
name: ${env_or:WANDB_RUN_NAME,local}
|
| 149 |
+
id: ${.name}
|
| 150 |
+
|
| 151 |
+
hydra:
|
| 152 |
+
run:
|
| 153 |
+
dir: ./${now:%Y.%m.%d}/
|
| 154 |
+
job:
|
| 155 |
+
chdir: True
|
| 156 |
+
|
| 157 |
+
checkpointing:
|
| 158 |
+
# Use custom `save_dir` if, e.g., saving to S3 bucket, otherwise leave this parameter as is
|
| 159 |
+
save_dir: ${paths.outputs}
|
| 160 |
+
# Note: `checkpoints` path should correspond to `checkpoint_every_n_steps.dirpath`
|
| 161 |
+
resume_from_ckpt: True
|
| 162 |
+
resume_ckpt_path: ${paths.checkpoints}/last.ckpt
|
| 163 |
+
|
| 164 |
+
callbacks:
|
| 165 |
+
model_checkpoint:
|
| 166 |
+
_target_: pytorch_lightning.callbacks.ModelCheckpoint
|
| 167 |
+
every_n_epochs: 1
|
| 168 |
+
monitor: "val/nll"
|
| 169 |
+
save_top_k: 10
|
| 170 |
+
mode: "min"
|
| 171 |
+
dirpath: ${paths.checkpoints}/11M-old-tokenizer
|
configs/path.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
paths:
|
| 2 |
+
base: ${cwd:} # repo root
|
| 3 |
+
data: ${paths.base}/data
|
| 4 |
+
checkpoints: ${paths.base}/checkpoints
|
| 5 |
+
tokenizers: ${paths.base}/tokenizers
|
| 6 |
+
outputs: ${paths.base}/outputs
|
| 7 |
+
|
main.py
CHANGED
|
@@ -16,7 +16,6 @@ import torch
|
|
| 16 |
import sys
|
| 17 |
import torch.distributed as dist
|
| 18 |
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 19 |
-
sys.path.append("/home/st512/peptune/scripts/peptide-mdlm-mcts")
|
| 20 |
|
| 21 |
import dataset as dataloader
|
| 22 |
import dataloading_for_dynamic_batching as dynamic_dataloader
|
|
@@ -30,24 +29,25 @@ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
|
|
| 30 |
from helm_tokenizer.helm_tokenizer import HelmTokenizer
|
| 31 |
|
| 32 |
|
| 33 |
-
#wandb.login(key="5a7613c531cb58f9802f3f8e2f73bc4997b917ab")
|
| 34 |
-
|
| 35 |
omegaconf.OmegaConf.register_new_resolver('cwd', os.getcwd)
|
| 36 |
omegaconf.OmegaConf.register_new_resolver('device_count', torch.cuda.device_count)
|
| 37 |
omegaconf.OmegaConf.register_new_resolver('eval', eval)
|
| 38 |
omegaconf.OmegaConf.register_new_resolver('div_up', lambda x, y: (x + y - 1) // y)
|
|
|
|
| 39 |
|
| 40 |
def _load_from_checkpoint(config, tokenizer):
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
| 51 |
|
| 52 |
@L.pytorch.utilities.rank_zero_only
|
| 53 |
def print_config(
|
|
@@ -197,36 +197,37 @@ def _train(config, logger, tokenizer, data_module):
|
|
| 197 |
|
| 198 |
model = Diffusion(config, tokenizer=tokenizer)
|
| 199 |
|
| 200 |
-
if config.backbone == 'finetune_roformer':
|
| 201 |
-
checkpoint = torch.load(
|
| 202 |
-
|
|
|
|
| 203 |
|
| 204 |
trainer.fit(model, datamodule=data_module, ckpt_path=ckpt_path)
|
| 205 |
|
| 206 |
|
| 207 |
-
@hydra.main(version_base=None, config_path='
|
| 208 |
def main(config):
|
| 209 |
"""
|
| 210 |
Main entry point for training
|
| 211 |
"""
|
| 212 |
-
wandb.init(project="peptune")
|
| 213 |
L.seed_everything(config.seed)
|
| 214 |
|
| 215 |
# print_config(config, resolve=True, save_cfg=True)
|
| 216 |
|
| 217 |
logger = utils.get_logger(__name__)
|
| 218 |
# load PeptideCLM tokenizer
|
| 219 |
-
|
|
|
|
| 220 |
tokenizer = APETokenizer()
|
| 221 |
-
tokenizer.load_vocabulary('/
|
| 222 |
elif config.vocab == 'old_smiles':
|
| 223 |
-
tokenizer = SMILES_SPE_Tokenizer('/
|
| 224 |
-
'/
|
| 225 |
elif config.vocab == 'selfies':
|
| 226 |
tokenizer = APETokenizer()
|
| 227 |
-
tokenizer.load_vocabulary('/
|
| 228 |
elif config.vocab == 'helm':
|
| 229 |
-
tokenizer = HelmTokenizer('/
|
| 230 |
|
| 231 |
if config.backbone == 'finetune_roformer':
|
| 232 |
train_dataset = load_dataset('csv', data_files=config.data.train)
|
|
@@ -236,7 +237,7 @@ def main(config):
|
|
| 236 |
val_dataset = val_dataset['train']#.select(lst)
|
| 237 |
data_module = dataloader.CustomDataModule(train_dataset, val_dataset, None, tokenizer, batch_size=config.loader.global_batch_size)
|
| 238 |
else:
|
| 239 |
-
data_module = dynamic_dataloader.CustomDataModule('
|
| 240 |
|
| 241 |
if config.mode == 'sample_eval':
|
| 242 |
generate_samples(config, logger, tokenizer)
|
|
@@ -247,4 +248,4 @@ def main(config):
|
|
| 247 |
|
| 248 |
|
| 249 |
if __name__ == '__main__':
|
| 250 |
-
main()
|
|
|
|
| 16 |
import sys
|
| 17 |
import torch.distributed as dist
|
| 18 |
from torch.nn.parallel import DistributedDataParallel as DDP
|
|
|
|
| 19 |
|
| 20 |
import dataset as dataloader
|
| 21 |
import dataloading_for_dynamic_batching as dynamic_dataloader
|
|
|
|
| 29 |
from helm_tokenizer.helm_tokenizer import HelmTokenizer
|
| 30 |
|
| 31 |
|
|
|
|
|
|
|
| 32 |
omegaconf.OmegaConf.register_new_resolver('cwd', os.getcwd)
|
| 33 |
omegaconf.OmegaConf.register_new_resolver('device_count', torch.cuda.device_count)
|
| 34 |
omegaconf.OmegaConf.register_new_resolver('eval', eval)
|
| 35 |
omegaconf.OmegaConf.register_new_resolver('div_up', lambda x, y: (x + y - 1) // y)
|
| 36 |
+
omegaconf.OmegaConf.register_new_resolver("env_or", lambda k, d: os.getenv(k, d))
|
| 37 |
|
| 38 |
def _load_from_checkpoint(config, tokenizer):
|
| 39 |
+
"""Create Diffusion model; load weights if checkpoint_path is set."""
|
| 40 |
+
if "hf" in str(config.get("backbone", "")):
|
| 41 |
+
return Diffusion(config, tokenizer=tokenizer).to("cuda")
|
| 42 |
+
|
| 43 |
+
ckpt_path = config.eval.checkpoint_path
|
| 44 |
+
model = Diffusion.load_from_checkpoint(
|
| 45 |
+
ckpt_path,
|
| 46 |
+
tokenizer=tokenizer,
|
| 47 |
+
config=config,
|
| 48 |
+
map_location="cuda" if torch.cuda.is_available() else "cpu",
|
| 49 |
+
)
|
| 50 |
+
return model
|
| 51 |
|
| 52 |
@L.pytorch.utilities.rank_zero_only
|
| 53 |
def print_config(
|
|
|
|
| 197 |
|
| 198 |
model = Diffusion(config, tokenizer=tokenizer)
|
| 199 |
|
| 200 |
+
if config.backbone == 'finetune_roformer' and config.eval.checkpoint_path:
|
| 201 |
+
checkpoint = torch.load(config.eval.checkpoint_path, map_location="cpu")
|
| 202 |
+
state = checkpoint.get("state_dict", checkpoint)
|
| 203 |
+
model.load_state_dict(state, strict=False)
|
| 204 |
|
| 205 |
trainer.fit(model, datamodule=data_module, ckpt_path=ckpt_path)
|
| 206 |
|
| 207 |
|
| 208 |
+
@hydra.main(version_base=None, config_path='configs', config_name='config')
|
| 209 |
def main(config):
|
| 210 |
"""
|
| 211 |
Main entry point for training
|
| 212 |
"""
|
|
|
|
| 213 |
L.seed_everything(config.seed)
|
| 214 |
|
| 215 |
# print_config(config, resolve=True, save_cfg=True)
|
| 216 |
|
| 217 |
logger = utils.get_logger(__name__)
|
| 218 |
# load PeptideCLM tokenizer
|
| 219 |
+
tok_dir = config.paths.tokenizers
|
| 220 |
+
if config.vocab == 'new_smiles':
|
| 221 |
tokenizer = APETokenizer()
|
| 222 |
+
tokenizer.load_vocabulary(f'{tok_dir}/peptide_smiles_600_vocab.json')
|
| 223 |
elif config.vocab == 'old_smiles':
|
| 224 |
+
tokenizer = SMILES_SPE_Tokenizer(f'{tok_dir}/new_vocab.txt',
|
| 225 |
+
f'{tok_dir}/new_splits.txt')
|
| 226 |
elif config.vocab == 'selfies':
|
| 227 |
tokenizer = APETokenizer()
|
| 228 |
+
tokenizer.load_vocabulary(f'{tok_dir}/peptide_selfies_600_vocab.json')
|
| 229 |
elif config.vocab == 'helm':
|
| 230 |
+
tokenizer = HelmTokenizer(f'{tok_dir}/monomer_vocab.txt')
|
| 231 |
|
| 232 |
if config.backbone == 'finetune_roformer':
|
| 233 |
train_dataset = load_dataset('csv', data_files=config.data.train)
|
|
|
|
| 237 |
val_dataset = val_dataset['train']#.select(lst)
|
| 238 |
data_module = dataloader.CustomDataModule(train_dataset, val_dataset, None, tokenizer, batch_size=config.loader.global_batch_size)
|
| 239 |
else:
|
| 240 |
+
data_module = dynamic_dataloader.CustomDataModule(f'{config.paths.data}/smiles/11M_smiles_old_tokenizer_no_limit', tokenizer)
|
| 241 |
|
| 242 |
if config.mode == 'sample_eval':
|
| 243 |
generate_samples(config, logger, tokenizer)
|
|
|
|
| 248 |
|
| 249 |
|
| 250 |
if __name__ == '__main__':
|
| 251 |
+
main()
|
peptune-pretrained.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b259f022c21121f5c755fed61230d6fdf2626ee4ab8a23df479b3cf553fd4aef
|
| 3 |
+
size 1386966244
|