In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [7]:
# Download necessary nltk resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\delhi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [8]:
# Sample data
data = {'text': ["This is a good movie",
                 "This is a bad movie",
                 "I like this food",
                 "I dislike that food",
                 "good job",
                 "bad luck",
                 "I love this",
                 "I hate that",
                 "this is a masterpiece",
                 "this is not good"],
        'label': ['positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative','positive','negative']}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,text,label
0,This is a good movie,positive
1,This is a bad movie,negative
2,I like this food,positive
3,I dislike that food,negative
4,good job,positive


In [10]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    filtered_tokens = [stemmer.stem(token) for token in tokens if token.isalnum() and token not in stop_words]
    return " ".join(filtered_tokens)

df['text'] = df['text'].apply(preprocess_text)
df.head()

Unnamed: 0,text,label
0,good movi,positive
1,bad movi,negative
2,like food,positive
3,dislik food,negative
4,good job,positive


In [11]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.3, random_state=42)


In [12]:
# Vectorize the text data
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


In [13]:
# Train a Multinomial Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

In [14]:
# Make predictions
y_pred = model.predict(X_test_vectorized)

In [15]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification report: \n", report)

Accuracy: 0.3333333333333333
Classification report: 
               precision    recall  f1-score   support

    negative       0.00      0.00      0.00         2
    positive       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Save the model
filename = 'naive_bayes_model.pkl'
pickle.dump(model, open(filename, 'wb'))
pickle.dump(vectorizer, open("vectorizer.pkl", 'wb'))

In [17]:
# Load the model
loaded_model = pickle.load(open(filename, 'rb'))
loaded_vectorizer = pickle.load(open("vectorizer.pkl", 'rb'))
new_text = ["this was great", "this was horrible"]
new_text_vectorized = loaded_vectorizer.transform(new_text)
print("Loaded model prediction: ", loaded_model.predict(new_text_vectorized))

Loaded model prediction:  ['positive' 'positive']


## Hyperparameter Tuning

In [20]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download necessary nltk resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Sample data
data = {'text': ["This is a good movie",
                 "This is a bad movie",
                 "I like this food",
                 "I dislike that food",
                 "good job",
                 "bad luck",
                 "I love this",
                 "I hate that",
                 "this is a masterpiece",
                 "this is not good"],
        'label': ['positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative','positive','negative']}
df = pd.DataFrame(data)

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    filtered_tokens = [stemmer.stem(token) for token in tokens if token.isalnum() and token not in stop_words]
    return " ".join(filtered_tokens)

df['text'] = df['text'].apply(preprocess_text)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.3, random_state=42)


# Vectorize the text data
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Define hyperparameter grid
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0, 3.0]}

# Grid search for hyperparameter tuning
grid = GridSearchCV(MultinomialNB(), param_grid, cv=2)
grid.fit(X_train_vectorized, y_train)
print(f"Best parameters for Naive Bayes: {grid.best_params_}")
print("Best Score for Naive Bayes: ", grid.best_score_)

# Evaluate the model
# Make predictions
y_pred = grid.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification report: \n", report)

Best parameters for Naive Bayes: {'alpha': 2.0}
Best Score for Naive Bayes:  0.25
Accuracy: 0.3333333333333333
Classification report: 
               precision    recall  f1-score   support

    negative       0.00      0.00      0.00         2
    positive       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
