Bioinformatics
Although I am not a student in bioinformatics, I have noticed that computational biology is a very interesting and increasingly important field. That is why I am trying to familiarize myself with Python. Some of the projects are shared here. Currently they are quite simple, but I am regurarely expanding upon them.
I am aware that various AI-programs have already greatly superseded what can be found on this page, and that this skill gap will only widen more. But these little projects are still a good practice for me to understand the basics of programming. And it will also help me better comprehend research papers that use modelling techniques.
Bird Sound Recognizer
This program is inspired by Observation.org (Waarneming.nl). You add the pathway to your own recording of a bird (in .wav format) and the program will try to determine the species from a database. The database is created by myself using verified audio sounds which are avaible from Observation.org. The database is currently quite small but I am trying to find a more streamlined approach to expand the database. The model itself is very simple, as it is mostly for me to learn the basics behind machine learning.
Because the project will get more updates I created a project page on GitHub.
Expand to see the code
import os
import librosa
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Function to extract MFCC features from audio
def extract_features(file_path):
y, sr = librosa.load(file_path, duration=5.0) # Load audio file
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13) # Extract MFCC features
return np.mean(mfcc.T, axis=0) # Take the mean of MFCC
# Prepare the dataset
def prepare_data(dataset_path):
features = []
labels = []
# Folders named after bird species
for label in os.listdir(dataset_path):
species_path = os.path.join(dataset_path, label)
if os.path.isdir(species_path):
for file in os.listdir(species_path):
if file.endswith(".wav"):
file_path = os.path.join(species_path, file)
mfccs = extract_features(file_path)
features.append(mfccs)
labels.append(label)
return np.array(features), np.array(labels)
# Load and split dataset
dataset_path = r'C:\Users\Private\Desktop\Python\Birds\Bird_sound_data'
X, y = prepare_data(dataset_path)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train a simple classifier (SVM)
model = SVC(kernel="linear")
model.fit(X_train, y_train)
# Test the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# Predict a new sound
def predict_bird_sound(file_path):
mfccs = extract_features(file_path)
prediction = model.predict([mfccs])
print("Predicted Bird Species:", prediction[0])
# Put here the bird sound that you want to know the species of
predict_bird_sound(r'C:\Users\Private\Desktop\Python\Birds\Bird_sound_data\Parus_major\parus_major_test.wav')
Protein Visualizer
This code visualizes the 3D-structure of a protein using the information from the Protein Data Bank.
Expand to see the code
import sys
import numpy as np
from Bio.PDB import PDBList, PDBParser, Polypeptide
import matplotlib.pyplot as plt
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QPushButton, QLineEdit, QLabel
class ProteinVisualizer(QWidget):
def __init__(self):
super().__init__()
self.initUI()
def initUI(self):
self.setWindowTitle('Protein Structure Visualizer')
layout = QVBoxLayout()
self.label = QLabel('Enter PDB ID:')
layout.addWidget(self.label)
self.pdb_input = QLineEdit(self)
layout.addWidget(self.pdb_input)
self.fetch_button = QPushButton('Fetch and Visualize', self)
self.fetch_button.clicked.connect(self.fetch_and_visualize)
layout.addWidget(self.fetch_button)
self.canvas = FigureCanvas(plt.Figure())
layout.addWidget(self.canvas)
self.setLayout(layout)
self.show()
def fetch_and_visualize(self):
pdb_id = self.pdb_input.text().strip().upper()
if not pdb_id:
return
# Fetch the PDB file
pdbl = PDBList()
filename = pdbl.retrieve_pdb_file(pdb_id, file_format='pdb', overwrite=True)
parser = PDBParser(QUIET=True)
structure = parser.get_structure(pdb_id, filename)
# Prepare data for visualization
self.visualize_structure(structure)
def visualize_structure(self, structure):
ax = self.canvas.figure.add_subplot(111, projection='3d')
ax.cla()
ax.set_title('Protein Structure')
# Extract coordinates to visualize
atoms = []
for model in structure:
for chain in model:
for residue in chain:
if Polypeptide.is_aa(residue):
for atom in residue.get_atoms():
atoms.append(atom.get_coord())
atoms = np.array(atoms)
ax.scatter(atoms[:, 0], atoms[:, 1], atoms[:, 2], c='blue', s=1)
ax.set_xlabel('X Axis')
ax.set_ylabel('Y Axis')
ax.set_zlabel('Z Axis')
self.canvas.draw()
if __name__ == '__main__':
app = QApplication(sys.argv)
viewer = ProteinVisualizer()
sys.exit(app.exec_())
Needleman-Wunsch
This is a small program for reading DNA or protein sequences from a FASTA file and applying the Needleman-Wunsch algorithm to perform a global sequence alignment on the two sequences.
Expand to see the code
import numpy as np
def read_fasta(file_path):
with open(file_path, 'r') as f:
sequences = {}
seq_id = ""
seq = ""
for line in f:
line = line.strip()
if line.startswith(">"):
if seq_id:
sequences[seq_id] = seq
seq_id = line[1:]
seq = ""
else:
seq += line
if seq_id:
sequences[seq_id] = seq
return sequences
def needleman_wunsch(seq1, seq2, match=1, mismatch=-1, gap=-2):
n, m = len(seq1), len(seq2)
score_matrix = np.zeros((n + 1, m + 1)) # Create score matrix
# Initialize the scoring matrix with gap penalties
for i in range(n + 1):
score_matrix[i][0] = gap * i
for j in range(m + 1):
score_matrix[0][j] = gap * j
# Fill in the score matrix
for i in range(1, n + 1):
for j in range(1, m + 1):
match_score = score_matrix[i - 1][j - 1] + (match if seq1[i - 1] == seq2[j - 1] else mismatch)
delete_score = score_matrix[i - 1][j] + gap
insert_score = score_matrix[i][j - 1] + gap
score_matrix[i][j] = max(match_score, delete_score, insert_score)
# Traceback
align1, align2 = "", ""
i, j = n, m
while i > 0 and j > 0:
current_score = score_matrix[i][j]
if seq1[i - 1] == seq2[j - 1]:
align1 += seq1[i - 1]
align2 += seq2[j - 1]
i -= 1
j -= 1
elif current_score == score_matrix[i - 1][j] + gap:
align1 += seq1[i - 1]
align2 += "-"
i -= 1
else:
align1 += "-"
align2 += seq2[j - 1]
j -= 1
# Fill in the remaining alignment
while i > 0:
align1 += seq1[i - 1]
align2 += "-"
i -= 1
while j > 0:
align1 += "-"
align2 += seq2[j - 1]
j -= 1
return score_matrix[n][m], align1[::-1], align2[::-1]
def main():
sequences = read_fasta(r'C:\Users\Private\Desktop\Python\FASTA\sequences.fasta')
seq_ids = list(sequences.keys())
if len(seq_ids) < 2:
print("Not enough sequences to align.")
return
seq1 = sequences[seq_ids[0]]
seq2 = sequences[seq_ids[1]]
score, align1, align2 = needleman_wunsch(seq1, seq2)
print(f"Alignment score: {score}\n")
print(f"Aligned Sequence 1:\n{align1}")
print(f"Aligned Sequence 2:\n{align2}")
if __name__ == "__main__":
main()
Color Recognizer
This code analyzes an image to find its dominant colors using K-means clustering. When the script is run, it will process the specified image and show its dominant colors. You can specify how many colors you want to see. The colors are given in RGB code.
Expand to see the code
import cv2
import numpy as np
def get_dominant_color(image, k=1):
pixels = image.reshape(-1, 3) # Reshape the image
pixels = np.float32(pixels) # Convert to float
# Define criteria for k-means
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
compactness, labels, centroids = cv2.kmeans(pixels, k, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)
# Return the color as an integer tuple
return [tuple(map(int, centroid)) for centroid in centroids]
def main(image_path):
# Load the image
image = cv2.imread(image_path)
if image is None:
print(f"Error: Unable to load image at {image_path}")
return # Exit the function
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert from BGR to RGB
dominant_color = get_dominant_color(image, k=3) # Get the dominant color
print("Dominant colors (RGB):", dominant_color) # Print the dominant colors
cv2.imshow('Image', image) # Show the original image
cv2.waitKey(0)
cv2.destroyAllWindows()
if __name__ == "__main__":
image_path = r'C:\Users\Private\Desktop\cat.jpg' # Provide path to the image you want to check
main(image_path)
Phylogenetic Tree
This code is for constructing and visualizing a very simple phylogenetic tree, in this case a dendogram, using a distance matrix and the hierarchical clustering method.
Expand to see the code
import matplotlib.pyplot
from scipy.cluster.hierarchy import linkage, dendrogram
class PhylogeneticTree:
def __init__(self, distance_matrix):
self.distance_matrix = distance_matrix
def calculate_linkage(self):
# Try 'average' or 'complete' instead of 'single'
return linkage(self.distance_matrix, method='average')
def construct_tree(self, linkage_matrix):
self.linkage_matrix = linkage_matrix
def plot_tree(self):
dendrogram(self.linkage_matrix, labels=['Species A', 'Species B', 'Species C', 'Species D'])
matplotlib.pyplot.show()
# Example usage with custom data
distance_matrix = [
[0, 0.5, 0.7, 0.9], # Distances from Species 1
[0.5, 0, 0.6, 0.8], # Distances from Species 2
[0.7, 0.6, 0, 0.1], # Distances from Species 3
[0.9, 0.8, 0.1, 0] # Distances from Species 4
]
phylogenetic_tree = PhylogeneticTree(distance_matrix)
linkage_matrix = phylogenetic_tree.calculate_linkage()
phylogenetic_tree.construct_tree(linkage_matrix)
phylogenetic_tree.plot_tree()

Cellular Automata
This codes the well known Conway’s Game of Life. When a cell has one or no neighbours, it dies. If the cell has two or three neighbours, the cell will live. If a cell has three live neighbours, it will be born. If a cell has more than three neighbours, it will die. It is a very simple model that represents how living systems, such as population dynamics, evolve with simple rules.
Expand to see the code
import numpy
import matplotlib.pyplot
import matplotlib.animation as animation
# Setup parameters
width, height = 50, 50 # Grid size
num_iterations = 100 # Number of iterations to simulate
initial_live_cells_ratio = 0.2 # Fraction of initial live cells
# Initialize the grid
def initialize_grid(width, height, live_cells_ratio):
grid = numpy.random.choice([0, 1], size=(width, height), p=[1-live_cells_ratio, live_cells_ratio])
return grid
# Rules Conway's Game of Life
def update_grid(grid):
new_grid = grid.copy()
for i in range(grid.shape[0]):
for j in range(grid.shape[1]):
# Sum the neighboring cells
total = (grid[i, (j-1) % height] + grid[i, (j+1) % height] +
grid[(i-1) % width, j] + grid[(i+1) % width, j] +
grid[(i-1) % width, (j-1) % height] + grid[(i-1) % width, (j+1) % height] +
grid[(i+1) % width, (j-1) % height] + grid[(i+1) % width, (j+1) % height])
# Apply rules for a simple growth model
if grid[i, j] == 1: # live cell
if total < 2 or total > 3:
new_grid[i, j] = 0 # die
else: # Dead cell
if total == 3:
new_grid[i, j] = 1 # Become alive
return new_grid
# Animate the simulation
def animate(i):
global grid
grid = update_grid(grid)
mat.set_array(grid)
return [mat]
# Grid
grid = initialize_grid(width, height, initial_live_cells_ratio)
# Plot
fig, ax = matplotlib.pyplot.subplots()
mat = ax.matshow(grid, cmap='binary')
matplotlib.pyplot.axis('off') # Hide the axes
# Create the animation
ani = animation.FuncAnimation(fig, animate, frames=num_iterations, interval=100)
# Show the animation
matplotlib.pyplot.show()
GC Content
This counts the GC content and the total amount of nucleotides of a FASTA-file.
Expand to see the code
from Bio import SeqIO
# Calculate GC content
def gc_content(sequence):
g_count = sequence.count('G')
c_count = sequence.count('C')
gc_percent = ((g_count + c_count) / len(sequence)) * 100
return gc_percent
# Function to read FASTA file, count GC content, and total nucleotides
def process_fasta(file_path):
with open(file_path, "r") as fasta_file:
for record in SeqIO.parse(fasta_file, "fasta"):
sequence = record.seq
gc_percent = gc_content(sequence)
total_nucleotides = len(sequence)
print(f"Sequence ID: {record.id}")
print(f"GC Content: {gc_percent:.2f}%")
print(f"Total Nucleotides: {total_nucleotides}\n")
# Main function
if __name__ == "__main__":
fasta_file_path = r"C:\Users\Private\Desktop\Python\GC content\dut.fasta"
process_fasta(fasta_file_path)
Bacterial Growth Plot
This is a very basic bacterial growth simulation. Its a simple formula for exponential growth with a maximum carrying capacity. For example, the growth of a bacterial colony. You type in your own parameters and when you run the code it will plot the graph.
Expand to see the code
import numpy
import matplotlib.pyplot
def bacterial_growth_logistic(N0, r, K, t):
N = K / (1 + (K - N0) / N0 * numpy.exp(-r * t)) #Formula
return N
# Parameters
N0 = 15 # Population at start
r = 0.5 # Growth rate
K = 1000 # Carrying capacity
t = numpy.linspace(0, 20, 50) # Start day, final day, amount of measurement points in between
# Simulation
N = bacterial_growth_logistic(N0, r, K, t)
# Plot the results
matplotlib.pyplot.figure(figsize=(10, 5))
matplotlib.pyplot.plot(t, N, label='Logistic Growth', color='green')
matplotlib.pyplot.title('Bacterial Growth Simulation')
matplotlib.pyplot.xlabel('Time (days)')
matplotlib.pyplot.ylabel('Population Size')
matplotlib.pyplot.axhline(y=K, color='red', linestyle='--', label='Carrying Capacity')
matplotlib.pyplot.legend()
matplotlib.pyplot.grid()
matplotlib.pyplot.show()
Biodiversity calculator
This is a very basic calculator for two common ways to represent biodiversity of a community: the Shannon-Weiner Index and the Simpon’s Index.
Expand to see the code
import math
def shannon_wiener_index(counts):
total = sum(counts)
proportions = [count / total for count in counts]
H = -sum(p * math.log(p) for p in proportions if p > 0)
return H
def simpsons_index(counts):
total = sum(counts)
proportions = [count / total for count in counts]
D = 1 - sum(p ** 2 for p in proportions)
return D
def diversity_index_calculator(counts):
H = shannon_wiener_index(counts) # Shannon-Wiener Index
D = simpsons_index(counts) # Simpson's Index
return H, D
species_counts = [5, 34, 23, 5, 11] # Fill in your own counts
H, D = diversity_index_calculator(species_counts)
print(f"Shannon-Wiener Index: {H:.4f}")
print(f"Simpson's Index: {D:.4f}")
Biology Quiz
A simple quiz where you can add your own questions and answers. At the end it gives your score.
Expand to see the code
# Create your own questions and answers
questions = {
"How many chromosomes has a human?": "46",
"What is the most common CRISPR-associated nuclease?": "Cas9",
"How many codons are there in the genetic code?": "64",
"To what division of fungi do most mushrooms that are visible by eye belong?": "Basidiomycota"
}
# Score
score = 0
# Quiz
for question, correct_answer in questions.items():
answer = input(question + " ")
# Check answer
if answer.strip().lower() == correct_answer.lower():
print("Correct!")
score += 1
else:
print("Wrong! The correct answer is", correct_answer)
# Display the final score
print(f"\nYour final score is {score} out of {len(questions)}.")