Tutorial 5: Guided Generation with ESM3¶

Guided generation is a powerful tool that lets you sample outputs from ESM3 that maximize any score function you choose.

For example, you may want to

  1. Guide generations towards higher quality metrics like pTM
  2. Constrain the distribution of outputs to have certain amino acid frequencies or structural attributes
  3. Minimize a biophysical energy function
  4. Use experimental screening data to guide designs with a regression model

As long as your scoring function takes a protein as input and returns a single score, you can use it to guide designs. To accomplish this, we use an implementation of derivative-free guidance inspired by Soft Value-Based Decoding, described in Li et al., 2024.
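
Conceptually, the guidance is a derivative-free search over partial generations: at each decoding step, several candidate unmaskings are sampled, each candidate is scored with your function, and a candidate is kept according to its score. Here is a minimal sketch of that idea; sample_candidate and score are illustrative placeholders, not the actual API, and the selection is shown as a simple argmax for clarity.

# Conceptual sketch only -- not the library's implementation.
# sample_candidate and score are illustrative placeholders.
def guided_decode_sketch(protein, sample_candidate, score, num_decoding_steps, num_samples_per_step):
    current = protein
    for _ in range(num_decoding_steps):
        # Propose several partial unmaskings of the current protein
        candidates = [sample_candidate(current) for _ in range(num_samples_per_step)]
        # Keep the candidate favored by the scoring function
        current = max(candidates, key=score)
    return current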

In this notebook we will walk through a few examples to illustrate how to use guided generation.

  1. Guide towards high pTM for improved generation quality
  2. Generate a protein with no cysteine (C) residues
  3. Maximize protein globularity by minimizing the radius of gyration

Imports¶

In [ ]:
!pip install git+https://github.com/evolutionaryscale/esm.git
!pip install py3dmol
In [1]:
import biotite.structure as bs
import py3Dmol
from esm.models.esm3 import ESM3
from esm.sdk.api import ESMProtein, GenerationConfig
from esm.sdk.experimental import ESM3GuidedDecoding, GuidedDecodingScoringFunction

Creating a scoring function¶

To get started with the guided generation API, all you need is a callable class that inherits from GuidedDecodingScoringFunction. This class receives an ESMProtein object as input and returns a numerical score.

For example, one of the computational metrics we can use to measure the quality of a generated protein structure is the Predicted Template Modelling (pTM) score, so we'll use it to create a PTMScoringFunction.

Fortunately for us, every time we generate a protein using ESM3 (either locally or on Forge) we also get its pTM, so all our class needs to do when it's called is return the ptm attribute of its input.

In [2]:
# Create scoring function (e.g. PTM scoring function)
class PTMScoringFunction(GuidedDecodingScoringFunction):
    def __call__(self, protein: ESMProtein) -> float:
        # Minimal example of a scoring function that scores proteins based on their pTM score
        # Given that ESM3 already has a pTM prediction head, we can directly access the pTM score
        assert protein.ptm is not None, "Protein must have pTM scores to be scored"
        return float(protein.ptm)

Initialize your client¶

Guided generation is compatible with both local inference using the ESM3 class and remote inference with the Forge client.

In [4]:
# To use the tokenizers and the open model, you'll need to log in to Hugging Face
# ! pip install ipywidgets
from huggingface_hub import notebook_login

notebook_login()
In [5]:
## Locally with ESM3-open
model = ESM3.from_pretrained().to("cuda")

## On Forge with larger ESM3 models
# from getpass import getpass

# from esm.sdk import client

# token = getpass("Token from Forge console: ")
# model = client(model="esm3-open", url="https://forge.evolutionaryscale.ai", token=token)

Guide towards high pTM for improved generation quality¶

Once your scoring function is defined and your model is initialized, you can create an ESM3GuidedDecoding instance to sample from it.

In [6]:
ptm_guided_decoding = ESM3GuidedDecoding(
    client=model, scoring_function=PTMScoringFunction()
)
In [7]:
# Start from a fully masked protein
PROTEIN_LENGTH = 256
starting_protein = ESMProtein(sequence="_" * PROTEIN_LENGTH)

# Call guided_generate
generated_protein = ptm_guided_decoding.guided_generate(
    protein=starting_protein,
    num_decoding_steps=len(starting_protein) // 8,
    num_samples_per_step=10,
)
Current score: 0.95: 100%|██████████| 32/32 [00:27<00:00,  1.15it/s]
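
The final score in the progress bar corresponds to the pTM of the selected generation. Assuming the returned ESMProtein retains its ptm attribute (which the scoring function above relies on), we can print it directly as a quick sanity check:

In [ ]:
# Quick check (assumes guided_generate returns a protein with ptm populated)
assert generated_protein.ptm is not None
print(f"pTM with guidance: {float(generated_protein.ptm):.2f}")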

Compare against baseline with no guidance¶

First, we sample a protein generated without any guidance. Without pTM guidance, we may sample proteins that have no well-defined structure.

In [10]:
# Generate a protein WITHOUT guidance
generated_protein_no_guided: ESMProtein = model.generate(
    input=starting_protein,
    config=GenerationConfig(track="sequence", num_steps=len(starting_protein) // 8),
)  # type: ignore

# Fold
generated_protein_no_guided: ESMProtein = model.generate(
    input=generated_protein_no_guided,
    config=GenerationConfig(track="structure", num_steps=1),
)  # type: ignore
100%|██████████| 32/32 [00:00<00:00, 41.03it/s]
100%|██████████| 1/1 [00:00<00:00, 41.38it/s]
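
For a quantitative comparison alongside the visual one below, we can also print the pTM of the unguided baseline, assuming the structure-track generate call above populates its ptm attribute:

In [ ]:
# Compare pTM of the unguided baseline against the guided generation
# (assumes both proteins carry a populated ptm attribute)
assert generated_protein_no_guided.ptm is not None
assert generated_protein.ptm is not None
print(f"pTM without guidance: {float(generated_protein_no_guided.ptm):.2f}")
print(f"pTM with guidance:    {float(generated_protein.ptm):.2f}")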
In [11]:
# Create a 1x2 grid of viewers (1 row, 2 columns)
view = py3Dmol.view(width=1000, height=500, viewergrid=(1, 2))

# Convert ESMProtein objects to ProteinChain objects
protein_chain1 = generated_protein_no_guided.to_protein_chain()
protein_chain2 = generated_protein.to_protein_chain()

# Add models to respective panels
view.addModel(protein_chain1.to_pdb_string(), "pdb", viewer=(0, 0))
view.addModel(protein_chain2.to_pdb_string(), "pdb", viewer=(0, 1))

# Set styles for each protein
view.setStyle({}, {"cartoon": {"color": "spectrum"}}, viewer=(0, 0))
view.setStyle({}, {"cartoon": {"color": "spectrum"}}, viewer=(0, 1))

# Zoom and center the view
view.zoomTo()
view.show()


Generate a Protein with No Cysteines¶

Guided generation is not limited to structural metrics; you can also use it to guide sequence generation.

For example, we can create a NoCysteineScoringFunction that penalizes the protein if it contains cysteine residues.

In [12]:
class NoCysteineScoringFunction(GuidedDecodingScoringFunction):
    def __call__(self, protein: ESMProtein) -> float:
        # Penalize proteins that contain cysteine
        assert protein.sequence is not None, "Protein must have a sequence to be scored"
        # Note that we use a negative score here to discourage the presence of cysteine
        return -float(protein.sequence.count("C"))
In [13]:
no_cysteine_guided_decoding = ESM3GuidedDecoding(
    client=model, scoring_function=NoCysteineScoringFunction()
)
In [14]:
no_cysteine_protein = no_cysteine_guided_decoding.guided_generate(
    protein=starting_protein,
    num_decoding_steps=len(starting_protein) // 8,
    num_samples_per_step=10,
)
Current score: 0.00: 100%|██████████| 32/32 [00:25<00:00,  1.23it/s]

Let's check our sequence!

If guided generation converged to score == 0.00, the resulting protein should contain no cysteine residues.

In [15]:
assert no_cysteine_protein.sequence is not None, "Protein must have a sequence"
print(no_cysteine_protein.sequence)
print(f"Number of cysteine residues: {no_cysteine_protein.sequence.count('C')}")
MANKILKNLRTTSKYISSRTTSRLTAYLIGFAEPRGLELLPITPAGRNPNDLLKLLSERIGWVSKRFSIKNVTVGSLVPINTSAVNVYRRTLSKTKTSLQSEVSTRQGTYTIPVNSFAIIEYTNLRKFIEELAGVKVRKVEFLLNEESLIIKIIPYISKDVQELRQLKVDIPKEIIEQFFGKSSIDKISKKFNKNNRIVEEKRKDYSREYYDIRTFPVENNEFKGSAEILSTHPVYVFETKNHQVESGVFLPLEIF
Number of cysteine residues: 0
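
The same pattern generalizes to other sequence-level objectives, such as steering toward a target amino-acid composition (one of the use cases listed at the top). Below is a hedged sketch; the TARGET_FREQS values are purely illustrative assumptions, not recommendations:

In [ ]:
# Sketch: penalize deviation from a target amino-acid composition.
# TARGET_FREQS is illustrative only.
TARGET_FREQS = {"A": 0.10, "L": 0.10, "K": 0.07}


class CompositionScoringFunction(GuidedDecodingScoringFunction):
    def __call__(self, protein: ESMProtein) -> float:
        assert protein.sequence is not None, "Protein must have a sequence to be scored"
        length = len(protein.sequence)
        # Negative total absolute deviation from the target frequencies
        return -sum(
            abs(protein.sequence.count(aa) / length - freq)
            for aa, freq in TARGET_FREQS.items()
        )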

Maximize Globularity¶

We minimize the radius of gyration as a proxy for maximizing globularity, and we also encourage generations to have high pTM by penalizing low-pTM candidates.

In [16]:
class RadiusOfGyrationScoringFunction(GuidedDecodingScoringFunction):
    def __call__(self, protein: ESMProtein) -> float:
        score = -1 * self.radius_of_gyration(protein)

        assert protein.ptm is not None, "Protein must have pTM scores to be scored"
        if protein.ptm < 0.5:
            # Penalize proteins with low pTM scores
            score = score * 2

        return score

    @staticmethod
    def radius_of_gyration(protein: ESMProtein) -> float:
        protein_chain = protein.to_protein_chain()
        arr = protein_chain.atom_array_no_insertions
        return float(bs.gyration_radius(arr))
In [17]:
radius_guided_decoding = ESM3GuidedDecoding(
    client=model, scoring_function=RadiusOfGyrationScoringFunction()
)
In [18]:
radius_guided_protein = radius_guided_decoding.guided_generate(
    protein=starting_protein,
    num_decoding_steps=len(starting_protein) // 8,
    num_samples_per_step=10,
)
Current score: -16.94: 100%|██████████| 32/32 [00:34<00:00,  1.08s/it]
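
We can also report the radius of gyration of the result directly, mirroring the helper in the scoring function above; the exact value will vary between runs:

In [ ]:
# Radius of gyration of the guided generation (value varies per run)
arr = radius_guided_protein.to_protein_chain().atom_array_no_insertions
print(f"Radius of gyration: {float(bs.gyration_radius(arr)):.2f} Å")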
In [19]:
view = py3Dmol.view(width=800, height=400)
view.addModel(radius_guided_protein.to_pdb_string(), "pdb")
view.setStyle({"cartoon": {"color": "spectrum"}})
view.zoomTo()
view.show()
