In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
import joblib # To save and load the model

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
            
documents = ["this is the first document", "this document is the second", "and this is the third one"]
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(documents)
print(vectorizer.vocabulary_)
print(X.toarray())

{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]]


In [12]:
# Create a sample dataset
data = {
    'email': [
        "free money urgent offer",
        "hello how are you",
        "get free gift now",
        "meeting scheduled for today",
        "urgent action needed",
        "let's have a coffee",
        "claim your reward",
        "check my document attached"
    ],
    'label': [1, 0, 1, 0, 0, 0, 1, 0]  # 1 for spam, 0 for not spam
}
df = pd.DataFrame(data)

# Convert emails to binary feature vectors
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(df['email'])
y = df['label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Bernoulli Naive Bayes model
model = BernoulliNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

Prediction using loaded model: [0 0]


In [13]:
# Save the model
joblib.dump(model, 'bernoulli_naive_bayes_model.pkl')

# Load the model
loaded_model = joblib.load('bernoulli_naive_bayes_model.pkl')

# Predict using the loaded model
y_pred_loaded = loaded_model.predict(X_test)
print(f"Prediction using loaded model: {y_pred_loaded}")

Prediction using loaded model: [0 0]


In [14]:
feature_log_prob = model.feature_log_prob_ #probability of each feature
feature_names = vectorizer.get_feature_names_out()
for i, category in enumerate(model.classes_):
    print(f"Most significant words for class: {category}")
    indices = feature_log_prob[i,:].argsort()[::-1][:5] #top 5 most significant features
    print(feature_names[indices])

Most significant words for class: 0
['urgent' 'scheduled' 'today' 'my' 'needed']
Most significant words for class: 1
['free' 'urgent' 'your' 'money' 'reward']


## Hyper parameter tuning 

In [19]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0]}

# Perform grid search with cross validation
grid_search = GridSearchCV(BernoulliNB(), param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Best parameters and score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Train the final model with the best parameters
best_model = BernoulliNB(**grid_search.best_params_)
best_model.fit(X_train, y_train)

Best Parameters: {'alpha': 0.1}
Best Score: 0.5
