Exercise 007¶
# Please execute this cell to download the necessary data
!wget https://raw.githubusercontent.com/JR-1991/PythonProgrammingBio24/main/scripts/utils.py
!wget https://raw.githubusercontent.com/JR-1991/PythonProgrammingBio24/main/data/single_sequence.fasta
!pip install pydantic biopython
from utils import CODON_TABLE, to_triplets
from Bio import pairwise2
def get_identity(seq1: str, seq2: str) -> float:
    """Aligns two sequences using BioPython and returns their identity.

    The identity is the global alignment score (``globalxx``) divided by
    the length of the query sequence.

    Args:
        seq1 (str): Query sequence to align to
        seq2 (str): Target sequence to align with

    Returns:
        float: Identity of the resulting alignment

    Raises:
        ValueError: If either sequence is empty.
    """
    # The score is normalised by the query length, so an empty sequence has
    # no meaningful identity. Fail with a clear message instead of an
    # unrelated arithmetic/type error further down.
    if not seq1 or not seq2:
        raise ValueError("Both sequences must be non-empty to compute an identity")

    score = pairwise2.align.globalxx(seq1, seq2, score_only=True)
    return score / len(seq1)
DNASequence class¶
Construct a DNASequence
class that contains the following attributes:
id
sequence
organism
gc_content
length
reverse_complement
Next, implement methods for your class that perform the following tasks:
- to_amino_acid: Converts the nucleic acid sequence to an amino acid sequence.
- align: Takes another sequence and aligns it against the instance sequence.
- __repr__: Defines how the contents of your class should be printed.
- from_fasta: A classmethod that parses a single FASTA entry into your class.
Demonstrate your class by parsing the single_sequence.fasta
file either manually or via the from_fasta
-classmethod.
Tips
- Feel free to use the get_identity function of the previous exercise.
- When implementing the classmethod, make sure to check that the format is correct. So far, we have followed the >[Organism]|[ID]\n[Sequence] format.
- Translate your sequence using the supplied to_triplets function and CODON_TABLE dictionary.
- Not familiar with reverse complements? Find more info here
- Don't hesitate to use the dataclass decorator. It can already help you in some ways. Learn more on how to implement __post_init__ to maximize customizability here
- Python lacks type validation, so you have limited control over what flows into your class. Pydantic is an excellent tool to solve this and other issues. Try it out to make your life easier!
class DNASequence:
    """
    A class to represent a DNA sequence with various utility methods.

    Attributes:
        sequence (str): The DNA sequence.
        id (str): The identifier for the DNA sequence.
        organism (str): The organism from which the DNA sequence is derived.
        gc_content (float): The GC content of the DNA sequence.
        length (int): The number of nucleotides in the DNA sequence.
        reverse_complement (str): The reverse complement of the DNA sequence.
    """

    def __init__(self, sequence: str, id: str, organism: str):
        """
        Initializes the DNASequence with a sequence, id, and organism.

        The derived attributes (GC content, length, reverse complement) are
        computed once here from the given sequence.

        Args:
            sequence (str): The DNA sequence.
            id (str): The identifier for the DNA sequence.
            organism (str): The organism from which the DNA sequence is derived.

        Raises:
            KeyError: If the sequence contains characters other than A, C, G, T.
        """
        self.sequence = sequence
        self.id = id
        self.organism = organism
        # The helpers are private and named differently from the attributes
        # so the instance attributes do not overwrite same-named methods.
        self.gc_content = self._compute_gc_content()
        self.length = len(sequence)
        self.reverse_complement = self._compute_reverse_complement()

    def __repr__(self) -> str:
        """Lists every instance attribute on its own tab-indented line."""
        attrs = "\n".join([
            f"\t{attr}: {value},"
            for attr, value in self.__dict__.items()
        ])
        return f"{type(self).__name__}\n{attrs}"

    def _compute_gc_content(self) -> float:
        """
        Computes the GC content of the DNA sequence.

        Returns:
            float: The GC content as a fraction of the total sequence length,
                or 0.0 for an empty sequence.
        """
        if not self.sequence:
            # Avoid dividing by zero; an empty sequence contains no G or C.
            return 0.0
        upper = self.sequence.upper()
        return (upper.count("G") + upper.count("C")) / len(self.sequence)

    def _compute_reverse_complement(self) -> str:
        """
        Computes the reverse complement of the DNA sequence.

        The sequence is upper-cased first, matching the case-insensitive
        GC content computation.

        Returns:
            str: The reverse complement of the DNA sequence (upper case).

        Raises:
            KeyError: If the sequence contains characters other than A, C, G, T.
        """
        mapping = {
            "A": "T",
            "T": "A",
            "G": "C",
            "C": "G",
        }
        return "".join(
            mapping[nuc] for nuc in reversed(self.sequence.upper())
        )

    @classmethod
    def from_fasta(cls, fasta: str):
        """
        Creates a DNASequence instance from a FASTA formatted string.

        Leading/trailing whitespace is ignored and a sequence wrapped over
        several lines is joined into one.

        Args:
            fasta (str): A single FASTA entry: a ``>[Organism]|[ID]`` header
                line followed by one or more sequence lines.

        Returns:
            DNASequence: An instance of the DNASequence class.

        Raises:
            ValueError: If the FASTA string is not in the expected format.
        """
        message = (
            "The given DNA sequence does not follow the expected format >[Organism]|[ID]\n[Sequence]"
        )
        lines = [line.strip() for line in fasta.strip().splitlines()]

        # A valid entry has a header plus at least one sequence line, and a
        # second ">" line would mean more than one record was passed in.
        if len(lines) < 2 or any(line.startswith(">") for line in lines[1:]):
            raise ValueError(message)

        header, *seq_lines = lines
        try:
            organism, seq_id = header.split("|")
        except ValueError:
            # Header has no "|" separator, or more than one.
            raise ValueError(message) from None

        return cls(
            sequence="".join(seq_lines).upper(),
            organism=organism.lstrip(">").strip(),
            id=seq_id.strip(),
        )

    def to_amino_acid(self) -> str:
        """
        Converts the DNA sequence to its corresponding amino acid sequence.

        Returns:
            str: The amino acid sequence.
        """
        return "".join(
            [CODON_TABLE[tripl] for tripl in to_triplets(self.sequence)]
        )

    def align(self, seq: "DNASequence | str") -> float:
        """
        Aligns the DNA sequence with another sequence and computes the identity.

        Args:
            seq (DNASequence | str): Another DNASequence instance, or a raw
                sequence string, to align with.

        Returns:
            float: The identity score of the alignment.
        """
        if isinstance(seq, DNASequence):
            seq = seq.sequence
        return get_identity(self.sequence, seq)
# Load the single sequence; the context manager closes the file handle
# as soon as its contents have been read.
with open("./single_sequence.fasta", encoding="utf-8") as handle:
    fasta = handle.read().strip()

# Parse the FASTA entry and print its attributes (long values are truncated).
obj = DNASequence.from_fasta(fasta)
print(f"ID: {obj.id}")
print(f"Organism: {obj.organism}")
print(f"GC: {obj.gc_content}")
print(f"Reverse complement: {obj.reverse_complement[:10]}...")
print(f"Amino acid sequence: {obj.to_amino_acid()[:10]}...")
ID: 1 Organism: ecoli GC: 0.5074971164936563 Reverse complement: TTATTTTAAA... Amino acid sequence: MRSRYLLHQY...
Using dataclasses¶
Dataclasses are a way to simplify the creation of classes that are primarily used to store data. Think of them as a more convenient way to create classes that hold information about something, like a student, book, or car, without having to write a lot of boilerplate code.
Why Use Dataclasses?¶
- Less Repetition: With dataclasses, you don’t have to manually write the code to store and retrieve the data for each attribute. The dataclass automatically takes care of it.
- Built-In Features: Dataclasses automatically provide useful features like a nice string representation of the object and the ability to compare two objects to see if they are the same.
- Readable Code: Your code becomes cleaner and easier to read because you focus only on what attributes your data should have, not on the repetitive code needed to manage those attributes.
When Dataclasses Shine¶
Dataclasses are particularly useful when you need to:
- Store Simple Data: Whenever you have objects that primarily exist to hold and transfer data.
- Avoid Boilerplate Code: Reduce the amount of repetitive code you have to write, making your codebase cleaner and easier to maintain.
- Compare Objects: Easily compare two objects to see if they are equal, without writing additional code.
Using __post_init__
¶
Sometimes, you might need to perform some additional actions after the dataclass has been initialized. This is where the __post_init__ method comes in handy. Think of it as a place to do extra setup or validation right after your dataclass instance has been created. For example, if you need to ensure that a certain attribute always follows a specific rule, or an attribute needs to be calculated based on other attributes, you can use __post_init__ to handle that.
from dataclasses import dataclass, field
@dataclass
class DNASequence:
    """
    A class to represent a DNA sequence with various utility methods.

    Attributes:
        sequence (str): The DNA sequence.
        id (str): The identifier for the DNA sequence.
        organism (str): The organism from which the DNA sequence is derived.
        gc_content (float): The GC content of the DNA sequence.
        length (int): The number of nucleotides in the DNA sequence.
        reverse_complement (str): The reverse complement of the DNA sequence.
    """

    sequence: str
    id: str
    organism: str
    # Derived fields: excluded from __init__ and filled in by __post_init__.
    gc_content: float = field(init=False)
    length: int = field(init=False)
    reverse_complement: str = field(init=False)

    def __post_init__(self):
        """
        Post-initialization processing to calculate GC content, length and
        reverse complement from the given sequence.
        """
        self.gc_content = self.compute_gc_content()
        self.length = len(self.sequence)
        self.reverse_complement = self.compute_reverse_complement()

    def compute_gc_content(self) -> float:
        """
        Computes the GC content of the DNA sequence.

        Returns:
            float: The GC content as a fraction of the total sequence length,
                or 0.0 for an empty sequence.
        """
        if not self.sequence:
            # Avoid dividing by zero; an empty sequence contains no G or C.
            return 0.0
        upper = self.sequence.upper()
        return (upper.count("G") + upper.count("C")) / len(self.sequence)

    def compute_reverse_complement(self) -> str:
        """
        Computes the reverse complement of the DNA sequence.

        The sequence is upper-cased first, matching the case-insensitive
        GC content computation.

        Returns:
            str: The reverse complement of the DNA sequence (upper case).

        Raises:
            KeyError: If the sequence contains characters other than A, C, G, T.
        """
        mapping = {
            "A": "T",
            "T": "A",
            "G": "C",
            "C": "G",
        }
        return "".join(
            mapping[nuc] for nuc in reversed(self.sequence.upper())
        )

    @classmethod
    def from_fasta(cls, fasta: str):
        """
        Creates a DNASequence instance from a FASTA formatted string.

        Leading/trailing whitespace is ignored and a sequence wrapped over
        several lines is joined into one.

        Args:
            fasta (str): A single FASTA entry: a ``>[Organism]|[ID]`` header
                line followed by one or more sequence lines.

        Returns:
            DNASequence: An instance of the DNASequence class.

        Raises:
            ValueError: If the FASTA string is not in the expected format.
        """
        message = (
            "The given DNA sequence does not follow the expected format >[Organism]|[ID]\n[Sequence]"
        )
        lines = [line.strip() for line in fasta.strip().splitlines()]

        # A valid entry has a header plus at least one sequence line, and a
        # second ">" line would mean more than one record was passed in.
        if len(lines) < 2 or any(line.startswith(">") for line in lines[1:]):
            raise ValueError(message)

        header, *seq_lines = lines
        try:
            organism, seq_id = header.split("|")
        except ValueError:
            # Header has no "|" separator, or more than one.
            raise ValueError(message) from None

        return cls(
            sequence="".join(seq_lines).upper(),
            organism=organism.lstrip(">").strip(),
            id=seq_id.strip(),
        )

    def to_amino_acid(self) -> str:
        """
        Converts the DNA sequence to its corresponding amino acid sequence.

        Returns:
            str: The amino acid sequence.
        """
        return "".join(
            [CODON_TABLE[tripl] for tripl in to_triplets(self.sequence)]
        )

    def align(self, seq: "DNASequence | str") -> float:
        """
        Aligns the DNA sequence with another sequence and computes the identity.

        Args:
            seq (DNASequence | str): Another DNASequence instance, or a raw
                sequence string, to align with.

        Returns:
            float: The identity score of the alignment.
        """
        if isinstance(seq, DNASequence):
            seq = seq.sequence
        return get_identity(self.sequence, seq)
# Load the single sequence; the context manager closes the file handle
# as soon as its contents have been read.
with open("./single_sequence.fasta", encoding="utf-8") as handle:
    fasta = handle.read().strip()

# Parse the FASTA entry and print its attributes (long values are truncated).
obj = DNASequence.from_fasta(fasta)
print(f"ID: {obj.id}")
print(f"Organism: {obj.organism}")
print(f"GC: {obj.gc_content}")
print(f"Reverse complement: {obj.reverse_complement[:10]}...")
print(f"Amino acid sequence: {obj.to_amino_acid()[:10]}...")
ID: 1 Organism: ecoli GC: 0.5074971164936563 Reverse complement: TTATTTTAAA... Amino acid sequence: MRSRYLLHQY...
Using PyDantic¶
Pydantic is a library that helps you define data models with built-in data validation. Think of it as a way to create classes that not only hold information but also automatically check that the information is correct. For instance, if you expect a user’s age to be a positive integer, Pydantic ensures it is, right when you create the object.
Why Use Pydantic?¶
- Automatic Validation: Pydantic checks the data types and values for you, ensuring that the data your program works with is always valid.
- Easy to Use: It’s straightforward to define data models with Pydantic, making your code cleaner and easier to understand.
- Error Handling: Pydantic provides clear and helpful error messages if the data doesn’t meet the expected format, making debugging easier.
- Built-In Parsing: Pydantic can automatically convert input data to the correct type, saving you from writing additional conversion code.
When Pydantic Shines¶
Pydantic is especially useful when you need to:
- Validate Input Data: Ensure that the data coming into your program (from user input, APIs, etc.) is in the correct format and meets all your requirements.
- Simplify Data Handling: Reduce the amount of manual validation and conversion code you need to write, making your codebase more maintainable.
- Work with APIs: When dealing with data from external sources, Pydantic helps ensure that the data you receive and send out is always in the expected format.
Pydantic makes your code more reliable by automatically checking and validating the data, allowing you to focus on building features instead of writing boilerplate validation code.
from pydantic import BaseModel, computed_field
class DNASequence(BaseModel):
    """
    A class to represent a DNA sequence with various utility methods.

    Attributes:
        sequence (str): The DNA sequence.
        id (str): The identifier for the DNA sequence.
        organism (str): The organism from which the DNA sequence is derived.

    Computed fields (derived from ``sequence`` on access):
        gc_content (float): The GC content of the DNA sequence.
        length (int): The number of nucleotides in the DNA sequence.
        reverse_complement (str): The reverse complement of the DNA sequence.
    """

    sequence: str
    id: str
    organism: str

    @computed_field
    @property
    def gc_content(self) -> float:
        """
        Computes the GC content of the DNA sequence.

        Returns:
            float: The GC content as a fraction of the total sequence length,
                or 0.0 for an empty sequence.
        """
        if not self.sequence:
            # Avoid dividing by zero; an empty sequence contains no G or C.
            return 0.0
        upper = self.sequence.upper()
        return (upper.count("G") + upper.count("C")) / len(self.sequence)

    @computed_field
    @property
    def length(self) -> int:
        """
        Returns:
            int: The number of nucleotides in the DNA sequence.
        """
        return len(self.sequence)

    @computed_field
    @property
    def reverse_complement(self) -> str:
        """
        Computes the reverse complement of the DNA sequence.

        The sequence is upper-cased first, matching the case-insensitive
        GC content computation.

        Returns:
            str: The reverse complement of the DNA sequence (upper case).

        Raises:
            KeyError: If the sequence contains characters other than A, C, G, T.
        """
        mapping = {
            "A": "T",
            "T": "A",
            "G": "C",
            "C": "G",
        }
        return "".join(
            mapping[nuc] for nuc in reversed(self.sequence.upper())
        )

    @classmethod
    def from_fasta(cls, fasta: str):
        """
        Creates a DNASequence instance from a FASTA formatted string.

        Leading/trailing whitespace is ignored and a sequence wrapped over
        several lines is joined into one.

        Args:
            fasta (str): A single FASTA entry: a ``>[Organism]|[ID]`` header
                line followed by one or more sequence lines.

        Returns:
            DNASequence: An instance of the DNASequence class.

        Raises:
            ValueError: If the FASTA string is not in the expected format.
        """
        message = (
            "The given DNA sequence does not follow the expected format >[Organism]|[ID]\n[Sequence]"
        )
        lines = [line.strip() for line in fasta.strip().splitlines()]

        # A valid entry has a header plus at least one sequence line, and a
        # second ">" line would mean more than one record was passed in.
        if len(lines) < 2 or any(line.startswith(">") for line in lines[1:]):
            raise ValueError(message)

        header, *seq_lines = lines
        try:
            organism, seq_id = header.split("|")
        except ValueError:
            # Header has no "|" separator, or more than one.
            raise ValueError(message) from None

        return cls(
            sequence="".join(seq_lines).upper(),
            organism=organism.lstrip(">").strip(),
            id=seq_id.strip(),
        )

    def to_amino_acid(self) -> str:
        """
        Converts the DNA sequence to its corresponding amino acid sequence.

        Returns:
            str: The amino acid sequence.
        """
        return "".join(
            [CODON_TABLE[tripl] for tripl in to_triplets(self.sequence)]
        )

    def align(self, seq: "DNASequence | str") -> float:
        """
        Aligns the DNA sequence with another sequence and computes the identity.

        Args:
            seq (DNASequence | str): Another DNASequence instance, or a raw
                sequence string, to align with.

        Returns:
            float: The identity score of the alignment.
        """
        if isinstance(seq, DNASequence):
            seq = seq.sequence
        return get_identity(self.sequence, seq)
# Load the single sequence; the context manager closes the file handle
# as soon as its contents have been read.
with open("./single_sequence.fasta", encoding="utf-8") as handle:
    fasta = handle.read().strip()

# Parse the FASTA entry and print its attributes (long values are truncated).
obj = DNASequence.from_fasta(fasta)
print(f"ID: {obj.id}")
print(f"Organism: {obj.organism}")
print(f"GC: {obj.gc_content}")
print(f"Reverse complement: {obj.reverse_complement[:10]}...")
print(f"Amino acid sequence: {obj.to_amino_acid()[:10]}...")
ID: 1 Organism: ecoli GC: 0.5074971164936563 Reverse complement: TTATTTTAAA... Amino acid sequence: MRSRYLLHQY...