Fix code example
README.md CHANGED
@@ -1,5 +1,4 @@
 ---
-language: protein
 tags:
 - protein language model
 datasets:

@@ -38,29 +37,29 @@ We have also noticed that for feature extraction, its better to use the feature
 Here is how to use this model to extract the features of a given protein sequence in PyTorch:
 
 ```python
-from transformers import T5Tokenizer, T5Model
-import re
-import torch
+sequence_examples = ["PRTEINO", "SEQWENCE"]
+# this will replace all rare/ambiguous amino acids by X and introduce white-space between all amino acids
+sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequence_examples]
 
-tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_uniref50', do_lower_case=False)
+# tokenize sequences and pad up to the longest sequence in the batch
+ids = tokenizer.batch_encode_plus(sequence_examples, add_special_tokens=True, padding="longest")
+input_ids = torch.tensor(ids['input_ids']).to(device)
+attention_mask = torch.tensor(ids['attention_mask']).to(device)
 
-model = T5Model.from_pretrained("Rostlab/prot_t5_xl_uniref50")
-
-sequences_Example = ["A E T C Z A O","S K T Z P"]
-
-sequences_Example = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences_Example]
-
-ids = tokenizer.batch_encode_plus(sequences_Example, add_special_tokens=True, padding=True)
+# generate embeddings
+with torch.no_grad():
+    embedding_repr = model(input_ids=input_ids, attention_mask=attention_mask)
 
-input_ids = torch.tensor(ids['input_ids'])
-attention_mask = torch.tensor(ids['attention_mask'])
+# extract embeddings for the first ([0,:]) sequence in the batch while removing padded & special tokens ([0,:7])
+emb_0 = embedding_repr.last_hidden_state[0,:7] # shape (7 x 1024)
+print(f"Shape of per-residue embedding of first sequence: {emb_0.shape}")
+# do the same for the second ([1,:]) sequence in the batch while taking into account different sequence lengths ([1,:8])
+emb_1 = embedding_repr.last_hidden_state[1,:8] # shape (8 x 1024)
 
-with torch.no_grad():
-    embedding = model(input_ids=input_ids, decoder_input_ids=input_ids)
+# if you want to derive a single representation (per-protein embedding) for the whole protein
+emb_0_per_protein = emb_0.mean(dim=0) # shape (1024)
 
-# For feature extraction we recommend to use the encoder embedding
-encoder_embedding = embedding[2].cpu().numpy()
-decoder_embedding = embedding[0].cpu().numpy()
+print(f"Shape of per-protein embedding of first sequence: {emb_0_per_protein.shape}")
 ```
 
 ## Training data
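
As committed, the fixed example still relies on `re`, `torch`, `tokenizer`, `model`, and `device` being defined earlier in the README, outside this hunk. Below is a minimal self-contained sketch of the same example; the imports, the `device` setup, and the `Rostlab/prot_t5_xl_uniref50` checkpoint name are assumptions not shown in the hunk, and `T5EncoderModel` is inferred from the forward pass (no decoder inputs, embeddings read from `last_hidden_state`).

```python
# Self-contained sketch of the fixed example. The setup below is an assumption:
# this hunk shows neither the imports nor the tokenizer/model loading.
from transformers import T5Tokenizer, T5EncoderModel
import re
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# assumed checkpoint name; substitute the ProtT5 model this README describes
tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_uniref50', do_lower_case=False)
model = T5EncoderModel.from_pretrained('Rostlab/prot_t5_xl_uniref50').to(device)
model.eval()

sequence_examples = ["PRTEINO", "SEQWENCE"]
# replace rare/ambiguous amino acids by X and introduce white-space between all amino acids
sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequence_examples]

# tokenize sequences and pad up to the longest sequence in the batch
ids = tokenizer.batch_encode_plus(sequence_examples, add_special_tokens=True, padding="longest")
input_ids = torch.tensor(ids['input_ids']).to(device)
attention_mask = torch.tensor(ids['attention_mask']).to(device)

# generate embeddings
with torch.no_grad():
    embedding_repr = model(input_ids=input_ids, attention_mask=attention_mask)

# per-residue embeddings: 7 residues of "PRTEINO", 8 of "SEQWENCE";
# the slices drop the padded positions and the trailing </s> special token
emb_0 = embedding_repr.last_hidden_state[0, :7]  # shape (7, 1024)
emb_1 = embedding_repr.last_hidden_state[1, :8]  # shape (8, 1024)

# per-protein embedding: mean over the residue dimension
emb_0_per_protein = emb_0.mean(dim=0)  # shape (1024,)
```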
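
The `[0,:7]` and `[1,:8]` slices hard-code the lengths of the two demo sequences (7 and 8 residues; the T5 tokenizer appends a single `</s>` token, which the slice drops along with padding). For arbitrary batches, the true lengths can instead be read off the attention mask; a sketch reusing the names from the example above:

```python
# Per-residue and per-protein embeddings for every sequence in the batch,
# without hard-coded lengths. attention_mask[i].sum() counts real tokens
# (residues + 1 trailing </s>), so subtracting 1 leaves just the residues.
per_protein_embeddings = []
for i in range(input_ids.shape[0]):
    seq_len = int(attention_mask[i].sum()) - 1
    per_residue = embedding_repr.last_hidden_state[i, :seq_len]  # (seq_len, 1024)
    per_protein_embeddings.append(per_residue.mean(dim=0))       # (1024,)
```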
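
The removed example converted its outputs with `.cpu().numpy()`; the new one leaves them as torch tensors. If NumPy arrays are needed downstream, the same conversion still applies (no `.detach()` needed, since the forward pass ran under `torch.no_grad()`):

```python
# move to CPU and convert to NumPy, e.g. for scikit-learn or on-disk storage
emb_0_np = emb_0.cpu().numpy()                          # (7, 1024) array
emb_0_per_protein_np = emb_0_per_protein.cpu().numpy()  # (1024,) array
```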
|