Bioinformatics

Although I am not a student in bioinformatics, I have noticed that computational biology is a very interesting and increasingly important field. That is why I am trying to familiarize myself with Python. Some of the projects are shared here. Currently they are quite simple, but I am regurarely expanding upon them.

I am aware that various AI-programs have already greatly superseded what can be found on this page, and that this skill gap will only widen more. But these little projects are still a good practice for me to understand the basics of programming. And it will also help me better comprehend research papers that use modelling techniques.

Bird Sound Recognizer

This program is inspired by Observation.org (Waarneming.nl). You add the pathway to your own recording of a bird (in .wav format) and the program will try to determine the species from a database. The database is created by myself using verified audio sounds which are avaible from Observation.org. The database is currently quite small but I am trying to find a more streamlined approach to expand the database. The model itself is very simple, as it is mostly for me to learn the basics behind machine learning.

Because the project will get more updates I created a project page on GitHub.

Expand to see the code

import os
import librosa
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Function to extract MFCC features from audio
def extract_features(file_path):
    y, sr = librosa.load(file_path, duration=5.0)  # Load audio file
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)  # Extract MFCC features
    return np.mean(mfcc.T, axis=0)  # Take the mean of MFCC

# Prepare the dataset
def prepare_data(dataset_path):
    features = []
    labels = []
    
    # Folders named after bird species
    for label in os.listdir(dataset_path):
        species_path = os.path.join(dataset_path, label)
        if os.path.isdir(species_path):
            for file in os.listdir(species_path):
                if file.endswith(".wav"):
                    file_path = os.path.join(species_path, file)
                    mfccs = extract_features(file_path)
                    features.append(mfccs)
                    labels.append(label)
    
    return np.array(features), np.array(labels)

# Load and split dataset
dataset_path = r'C:\Users\Private\Desktop\Python\Birds\Bird_sound_data'
X, y = prepare_data(dataset_path)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a simple classifier (SVM)
model = SVC(kernel="linear")
model.fit(X_train, y_train)

# Test the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Predict a new sound
def predict_bird_sound(file_path):
    mfccs = extract_features(file_path)
    prediction = model.predict([mfccs])
    print("Predicted Bird Species:", prediction[0])

# Put here the bird sound that you want to know the species of
predict_bird_sound(r'C:\Users\Private\Desktop\Python\Birds\Bird_sound_data\Parus_major\parus_major_test.wav')

Protein Visualizer

This code visualizes the 3D-structure of a protein using the information from the Protein Data Bank.

Expand to see the code

import sys
import numpy as np
from Bio.PDB import PDBList, PDBParser, Polypeptide
import matplotlib.pyplot as plt
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QPushButton, QLineEdit, QLabel


class ProteinVisualizer(QWidget):
    def __init__(self):
        super().__init__()
        
        self.initUI()

    def initUI(self):
        self.setWindowTitle('Protein Structure Visualizer')
        
        layout = QVBoxLayout()

        self.label = QLabel('Enter PDB ID:')
        layout.addWidget(self.label)

        self.pdb_input = QLineEdit(self)
        layout.addWidget(self.pdb_input)

        self.fetch_button = QPushButton('Fetch and Visualize', self)
        self.fetch_button.clicked.connect(self.fetch_and_visualize)
        layout.addWidget(self.fetch_button)

        self.canvas = FigureCanvas(plt.Figure())
        layout.addWidget(self.canvas)

        self.setLayout(layout)
        self.show()

    def fetch_and_visualize(self):
        pdb_id = self.pdb_input.text().strip().upper()
        if not pdb_id:
            return

        # Fetch the PDB file
        pdbl = PDBList()
        filename = pdbl.retrieve_pdb_file(pdb_id, file_format='pdb', overwrite=True)

        parser = PDBParser(QUIET=True)
        structure = parser.get_structure(pdb_id, filename)

        # Prepare data for visualization
        self.visualize_structure(structure)

    def visualize_structure(self, structure):
        ax = self.canvas.figure.add_subplot(111, projection='3d')
        ax.cla()
        ax.set_title('Protein Structure')

        # Extract coordinates to visualize
        atoms = []
        for model in structure:
            for chain in model:
                for residue in chain:
                    if Polypeptide.is_aa(residue):
                        for atom in residue.get_atoms():
                            atoms.append(atom.get_coord())

        atoms = np.array(atoms)

        ax.scatter(atoms[:, 0], atoms[:, 1], atoms[:, 2], c='blue', s=1)
        ax.set_xlabel('X Axis')
        ax.set_ylabel('Y Axis')
        ax.set_zlabel('Z Axis')

        self.canvas.draw()


if __name__ == '__main__':
    app = QApplication(sys.argv)
    viewer = ProteinVisualizer()
    sys.exit(app.exec_())

Needleman-Wunsch

This is a small program for reading DNA or protein sequences from a FASTA file and applying the Needleman-Wunsch algorithm to perform a global sequence alignment on the two sequences.

Expand to see the code

import numpy as np

def read_fasta(file_path):
    with open(file_path, 'r') as f:
        sequences = {}
        seq_id = ""
        seq = ""
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                if seq_id:
                    sequences[seq_id] = seq
                seq_id = line[1:]
                seq = ""
            else:
                seq += line
        if seq_id:
            sequences[seq_id] = seq
    return sequences

def needleman_wunsch(seq1, seq2, match=1, mismatch=-1, gap=-2):
    n, m = len(seq1), len(seq2)
    score_matrix = np.zeros((n + 1, m + 1))  # Create score matrix

    # Initialize the scoring matrix with gap penalties
    for i in range(n + 1):
        score_matrix[i][0] = gap * i
    for j in range(m + 1):
        score_matrix[0][j] = gap * j

    # Fill in the score matrix
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            match_score = score_matrix[i - 1][j - 1] + (match if seq1[i - 1] == seq2[j - 1] else mismatch)
            delete_score = score_matrix[i - 1][j] + gap
            insert_score = score_matrix[i][j - 1] + gap
            score_matrix[i][j] = max(match_score, delete_score, insert_score)

    # Traceback
    align1, align2 = "", ""
    i, j = n, m

    while i > 0 and j > 0:
        current_score = score_matrix[i][j]
        if seq1[i - 1] == seq2[j - 1]:
            align1 += seq1[i - 1]
            align2 += seq2[j - 1]
            i -= 1
            j -= 1
        elif current_score == score_matrix[i - 1][j] + gap:
            align1 += seq1[i - 1]
            align2 += "-"
            i -= 1
        else:
            align1 += "-"
            align2 += seq2[j - 1]
            j -= 1

    # Fill in the remaining alignment
    while i > 0:
        align1 += seq1[i - 1]
        align2 += "-"
        i -= 1
    while j > 0:
        align1 += "-"
        align2 += seq2[j - 1]
        j -= 1

    return score_matrix[n][m], align1[::-1], align2[::-1]

def main():
    sequences = read_fasta(r'C:\Users\Private\Desktop\Python\FASTA\sequences.fasta') 
    seq_ids = list(sequences.keys())
    
    if len(seq_ids) < 2:
        print("Not enough sequences to align.")
        return
    
    seq1 = sequences[seq_ids[0]]
    seq2 = sequences[seq_ids[1]]
    
    score, align1, align2 = needleman_wunsch(seq1, seq2)
    
    print(f"Alignment score: {score}\n")
    print(f"Aligned Sequence 1:\n{align1}")
    print(f"Aligned Sequence 2:\n{align2}")

if __name__ == "__main__":
    main()

Color Recognizer

This code analyzes an image to find its dominant colors using K-means clustering. When the script is run, it will process the specified image and show its dominant colors. You can specify how many colors you want to see. The colors are given in RGB code.

Expand to see the code

import cv2
import numpy as np

def get_dominant_color(image, k=1):
    pixels = image.reshape(-1, 3) # Reshape the image
    pixels = np.float32(pixels) # Convert to float

    # Define criteria for k-means
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    compactness, labels, centroids = cv2.kmeans(pixels, k, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)

    # Return the color as an integer tuple
    return [tuple(map(int, centroid)) for centroid in centroids]

def main(image_path):
    # Load the image
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Unable to load image at {image_path}")
        return  # Exit the function

    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert from BGR to RGB

    dominant_color = get_dominant_color(image, k=3) # Get the dominant color
    
    print("Dominant colors (RGB):", dominant_color) # Print the dominant colors

    cv2.imshow('Image', image) # Show the original image
    cv2.waitKey(0)
    cv2.destroyAllWindows()

if __name__ == "__main__":
    image_path = r'C:\Users\Private\Desktop\cat.jpg'  # Provide path to the image you want to check
    main(image_path)

Phylogenetic Tree

This code is for constructing and visualizing a very simple phylogenetic tree, in this case a dendogram, using a distance matrix and the hierarchical clustering method.

Expand to see the code

import matplotlib.pyplot
from scipy.cluster.hierarchy import linkage, dendrogram

class PhylogeneticTree:
    def __init__(self, distance_matrix):
        self.distance_matrix = distance_matrix

    def calculate_linkage(self):
        # Try 'average' or 'complete' instead of 'single'
        return linkage(self.distance_matrix, method='average')

    def construct_tree(self, linkage_matrix):
        self.linkage_matrix = linkage_matrix

    def plot_tree(self):
        dendrogram(self.linkage_matrix, labels=['Species A', 'Species B', 'Species C', 'Species D'])
        matplotlib.pyplot.show()

# Example usage with custom data
distance_matrix = [
    [0, 0.5, 0.7, 0.9],  # Distances from Species 1
    [0.5, 0, 0.6, 0.8],  # Distances from Species 2
    [0.7, 0.6, 0, 0.1],  # Distances from Species 3
    [0.9, 0.8, 0.1, 0]   # Distances from Species 4
]

phylogenetic_tree = PhylogeneticTree(distance_matrix)
linkage_matrix = phylogenetic_tree.calculate_linkage()
phylogenetic_tree.construct_tree(linkage_matrix)
phylogenetic_tree.plot_tree()

Cellular Automata

This codes the well known Conway’s Game of Life. When a cell has one or no neighbours, it dies. If the cell has two or three neighbours, the cell will live. If a cell has three live neighbours, it will be born. If a cell has more than three neighbours, it will die. It is a very simple model that represents how living systems, such as population dynamics, evolve with simple rules.

Expand to see the code

import numpy
import matplotlib.pyplot
import matplotlib.animation as animation

# Setup parameters
width, height = 50, 50  # Grid size
num_iterations = 100  # Number of iterations to simulate
initial_live_cells_ratio = 0.2  # Fraction of initial live cells

# Initialize the grid
def initialize_grid(width, height, live_cells_ratio):
    grid = numpy.random.choice([0, 1], size=(width, height), p=[1-live_cells_ratio, live_cells_ratio])
    return grid

# Rules Conway's Game of Life
def update_grid(grid):
    new_grid = grid.copy()
    for i in range(grid.shape[0]):
        for j in range(grid.shape[1]):
            # Sum the neighboring cells
            total = (grid[i, (j-1) % height] + grid[i, (j+1) % height] + 
                     grid[(i-1) % width, j] + grid[(i+1) % width, j] +
                     grid[(i-1) % width, (j-1) % height] + grid[(i-1) % width, (j+1) % height] +
                     grid[(i+1) % width, (j-1) % height] + grid[(i+1) % width, (j+1) % height])
            
            # Apply rules for a simple growth model
            if grid[i, j] == 1:  # live cell
                if total < 2 or total > 3:
                    new_grid[i, j] = 0  # die
            else:  # Dead cell
                if total == 3:
                    new_grid[i, j] = 1  # Become alive
    return new_grid

# Animate the simulation
def animate(i):
    global grid
    grid = update_grid(grid)
    mat.set_array(grid)
    return [mat]

# Grid
grid = initialize_grid(width, height, initial_live_cells_ratio)

# Plot
fig, ax = matplotlib.pyplot.subplots()
mat = ax.matshow(grid, cmap='binary')
matplotlib.pyplot.axis('off')  # Hide the axes

# Create the animation
ani = animation.FuncAnimation(fig, animate, frames=num_iterations, interval=100)

# Show the animation
matplotlib.pyplot.show()

GC Content

This counts the GC content and the total amount of nucleotides of a FASTA-file.

Expand to see the code

from Bio import SeqIO

# Calculate GC content
def gc_content(sequence):
    g_count = sequence.count('G')
    c_count = sequence.count('C')
    gc_percent = ((g_count + c_count) / len(sequence)) * 100
    return gc_percent

# Function to read FASTA file, count GC content, and total nucleotides
def process_fasta(file_path):
    with open(file_path, "r") as fasta_file:
        for record in SeqIO.parse(fasta_file, "fasta"):
            sequence = record.seq
            gc_percent = gc_content(sequence)
            total_nucleotides = len(sequence)
            print(f"Sequence ID: {record.id}")
            print(f"GC Content: {gc_percent:.2f}%")
            print(f"Total Nucleotides: {total_nucleotides}\n")

# Main function
if __name__ == "__main__":
    fasta_file_path = r"C:\Users\Private\Desktop\Python\GC content\dut.fasta"
    process_fasta(fasta_file_path)

Bacterial Growth Plot

This is a very basic bacterial growth simulation. Its a simple formula for exponential growth with a maximum carrying capacity. For example, the growth of a bacterial colony. You type in your own parameters and when you run the code it will plot the graph.

Expand to see the code

import numpy
import matplotlib.pyplot

def bacterial_growth_logistic(N0, r, K, t):
    N = K / (1 + (K - N0) / N0 * numpy.exp(-r * t)) #Formula
    return N

# Parameters
N0 = 15 # Population at start
r = 0.5 # Growth rate
K = 1000 # Carrying capacity
t = numpy.linspace(0, 20, 50) # Start day, final day, amount of measurement points in between

# Simulation
N = bacterial_growth_logistic(N0, r, K, t)

# Plot the results
matplotlib.pyplot.figure(figsize=(10, 5))
matplotlib.pyplot.plot(t, N, label='Logistic Growth', color='green')
matplotlib.pyplot.title('Bacterial Growth Simulation')
matplotlib.pyplot.xlabel('Time (days)')
matplotlib.pyplot.ylabel('Population Size')
matplotlib.pyplot.axhline(y=K, color='red', linestyle='--', label='Carrying Capacity')
matplotlib.pyplot.legend()
matplotlib.pyplot.grid()
matplotlib.pyplot.show()

Biodiversity calculator

This is a very basic calculator for two common ways to represent biodiversity of a community: the Shannon-Weiner Index and the Simpon’s Index.

Expand to see the code

import math

def shannon_wiener_index(counts):
    total = sum(counts)
    proportions = [count / total for count in counts]
    H = -sum(p * math.log(p) for p in proportions if p > 0)
    return H

def simpsons_index(counts):
    total = sum(counts)
    proportions = [count / total for count in counts]
    D = 1 - sum(p ** 2 for p in proportions)
    return D

def diversity_index_calculator(counts):
    H = shannon_wiener_index(counts)  # Shannon-Wiener Index
    D = simpsons_index(counts) # Simpson's Index
    return H, D

species_counts = [5, 34, 23, 5, 11]  # Fill in your own counts
H, D = diversity_index_calculator(species_counts)

print(f"Shannon-Wiener Index: {H:.4f}")
print(f"Simpson's Index: {D:.4f}")

Biology Quiz

A simple quiz where you can add your own questions and answers. At the end it gives your score.

Expand to see the code

# Create your own questions and answers
questions = {
    "How many chromosomes has a human?": "46",
    "What is the most common CRISPR-associated nuclease?": "Cas9",
    "How many codons are there in the genetic code?": "64",
    "To what division of fungi do most mushrooms that are visible by eye belong?": "Basidiomycota"
}

# Score
score = 0

# Quiz
for question, correct_answer in questions.items():
    answer = input(question + " ")

    # Check answer
    if answer.strip().lower() == correct_answer.lower():
        print("Correct!")
        score += 1
    else:
        print("Wrong! The correct answer is", correct_answer)

# Display the final score
print(f"\nYour final score is {score} out of {len(questions)}.")