# Load data

In [None]:
# load text data and convert the label/sentiment into corresponding numeric values: 'positive':2,'neutral':1,'negative':0
# possible packages you might need are: pandas, numpy

import pandas as pd
import numpy as np

# read the training data: the CSV has no header row, so the two columns are named here
fname = 'facebook_comments.csv'
df_train = pd.read_csv(
    fname,
    header=None,
    names=['text','sentiment'],
    encoding='iso-8859-1',
    lineterminator='\n',
)

# numeric code for each sentiment class; surrounding whitespace is stripped before the lookup
sent = {'positive':2,'neutral':1,'negative':0}
sentiment_clean = df_train['sentiment'].str.strip()
df_train['labels'] = sentiment_clean.map(sent)

# get texts and labels as arrays
training_texts = df_train['text'].values
labels = df_train['labels'].values

# show the first 5 records
df_train.head()

Unnamed: 0,text,sentiment,labels
0,Heres a single to add to Kindle. Just read t...,neutral,1
1,If you tire of Non-Fiction.. Check out http://...,neutral,1
2,Ghost of Round Island is supposedly nonfiction.,neutral,1
3,Why is Barnes and Nobles version of the Kindle...,negative,0
4,@Maria: Do you mean the Nook? Be careful bo...,positive,2


# Preprocess data

In [None]:
# preprocess the loaded textual data, including removing stopwords, stemming, and tokenization, etc. 
# represent each document (i.e., comment) using TF-IDF strategy. The features are the top frequent unigrams across all comments.
# possible packages you might need are: scikit-learn, numpy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

# tokenize and create a document-feature matrix X and a label vector Y
# (TF-IDF weights, English stop words removed, vocabulary capped at 500 terms)
vectorizer = TfidfVectorizer(stop_words='english',max_features=500)
instances = vectorizer.fit_transform(training_texts)
X = instances # training instances (every row is a training comment, so no slicing is needed)
Y = np.array(labels) # training labels

# If using MSE loss, the output for each instance needs to be a vector of
# output_dimension; one-hot encode the labels like this:
#   onehot_encoder = OneHotEncoder(sparse_output=False)  # use sparse=False on scikit-learn < 1.2
#   Y = onehot_encoder.fit_transform(Y.reshape(-1, 1))

# print out the shape of X and Y
print(X.shape,',',Y.shape)

(1999, 500) , (1999,)


# Traditional Machine Learning Models: Random Forest

In [None]:
# using 10-fold cross-validation to show the prediction accuracy
# possible packages you might need are: scikit-learn, numpy

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

# 10-fold cross-validation with a fixed seed so the folds are reproducible
kfold = KFold(n_splits=10,shuffle=True,random_state=2020)

rf_cvscores = [] # store accuracy score for each fold
rf_model = RandomForestClassifier(random_state=2020,max_depth=2,criterion='entropy')

# NOTE: the index arrays are deliberately NOT called `train`/`test`: loop variables
# leak into the notebook namespace, and `train` is the name of the training function
# defined further down, which would be overwritten if this cell were re-run later.
for train_idx, test_idx in kfold.split(X):
  rf_model.fit(X[train_idx],Y[train_idx])
  rf_acc = rf_model.score(X[test_idx],Y[test_idx])

  rf_cvscores.append(rf_acc)

print("Random Forest - mean: %.4f%% (std: +/- %.4f%%)" % (np.mean(rf_cvscores)*100, np.std(rf_cvscores)*100))

Random Forest - mean: 64.1332% (std: +/- 2.0919%)


# Fully connected feedforward Neural Network

In [None]:
# Design your own network with the following requirements:
# 1. Having dropout
# 2. Separate the dataset into training and validation (80-20%)
# 3. The prediction accuracy on the validation set should be at least 50% for this 3-class classification problem

# possible packages you might need are: scikit-learn, numpy, torch
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import torch.optim as optim

- Build the train loader and validation loader


In [None]:
# convert your numpy array to TensorDataset and create a data loader for training and validation, respectively
# some hyperparameters: input dimension, output dimension, batch size, number of epochs, and the learning rate.
epochs = 5
lr = 1e-4
indim = X.shape[1]
outdim = 3
drate = 0.7
batch_size = 16

# seed torch's global RNG so the random 80/20 split and the training-batch
# shuffling are reproducible (same seed value as the scikit-learn models above)
torch.manual_seed(2020)

# densify the sparse TF-IDF matrix; the tensors keep numpy's dtypes
X_tensor = torch.from_numpy(X.toarray())
Y_tensor = torch.from_numpy(Y)

# 80% training / 20% validation; val_size takes the remainder so the sizes always add up
dataset = TensorDataset(X_tensor,Y_tensor)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# only the training batches need shuffling; evaluation does not depend on batch order
train_loader = DataLoader(train_dataset,batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset,batch_size=batch_size,shuffle=False)

- Build the network

In [None]:
# create your model/network
class SentimentNetwork(nn.Module):
  """Fully connected feed-forward classifier with dropout.

  Layers: input_dim -> 100 -> 50 -> output_dim. Each hidden layer is followed
  by ReLU and dropout. The forward pass returns log-probabilities over the
  output classes.
  """

  def __init__(self, input_dim, output_dim, dropout_rate):
    """Create the layers.

    Args:
      input_dim: number of input features per instance.
      output_dim: number of output classes.
      dropout_rate: probability of zeroing a hidden activation during training.
    """
    super().__init__()
    self.fc1 = nn.Linear(input_dim,100)
    self.dropout = nn.Dropout(p=dropout_rate)
    self.fc2 = nn.Linear(100,50)
    self.fc3 = nn.Linear(50,output_dim)

  def forward(self,x):
    """Map a float tensor whose last dimension is input_dim to log-probabilities."""
    x = F.relu(self.fc1(x))
    x = self.dropout(x)
    x = F.relu(self.fc2(x))
    x = self.dropout(x)
    # Normalise over the class dimension explicitly: omitting `dim` is deprecated
    # and emits a warning on every call. log_softmax is idempotent, so a loss that
    # applies log_softmax itself (e.g. nn.CrossEntropyLoss) gives the same value on
    # these outputs as it would on the raw logits.
    x = F.log_softmax(self.fc3(x), dim=-1)

    return x

# instantiate the network with the hyperparameters chosen above and show its layers
model = SentimentNetwork(input_dim=indim, output_dim=outdim, dropout_rate=drate)
print(model)

SentimentNetwork(
  (fc1): Linear(in_features=500, out_features=100, bias=True)
  (dropout): Dropout(p=0.7, inplace=False)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=3, bias=True)
)


- Create a training function to train the model and an evaluation function to evaluate the performance on the separate validation set

In [None]:
# define a training process function
def train(model, train_loader, optimizer, criterion):
  """Run one training epoch over train_loader.

  Args:
    model: network to optimise; it is switched to training mode.
    train_loader: iterable of (features, labels) batches.
    optimizer: optimizer that updates the model's parameters.
    criterion: loss taking (predictions, integer class labels).

  Returns:
    (mean of the per-batch losses, fraction of samples classified correctly).
  """
  epoch_loss = 0.0
  n_correct, n_samples = 0, 0

  model.train()

  for batch_X, batch_Y in train_loader:
    targets = batch_Y.long()

    optimizer.zero_grad()
    predictions = model(batch_X.float())
    loss = criterion(predictions, targets)
    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()
    # Count hits and samples actually seen instead of dividing by a fixed batch
    # size: the last batch is usually smaller, which would deflate the accuracy.
    n_correct += (predictions.detach().argmax(dim=1) == targets).sum().item()
    n_samples += targets.numel()

  return epoch_loss / len(train_loader), n_correct / n_samples

# define a validation/evaluation process function
def evaluate(model, val_loader, criterion):
  """Evaluate the model on val_loader without updating its parameters.

  Args:
    model: network to evaluate; it is switched to evaluation mode.
    val_loader: iterable of (features, labels) batches.
    criterion: loss taking (predictions, integer class labels).

  Returns:
    (mean of the per-batch losses, fraction of samples classified correctly).
  """
  epoch_loss = 0.0
  n_correct, n_samples = 0, 0

  model.eval()

  with torch.no_grad():
    for batch_X, batch_Y in val_loader:
      targets = batch_Y.long()
      predictions = model(batch_X.float())
      loss = criterion(predictions, targets)

      epoch_loss += loss.item()
      # Count hits and samples actually seen instead of dividing by a fixed batch
      # size: the last batch is usually smaller, which would deflate the accuracy.
      n_correct += (predictions.argmax(dim=1) == targets).sum().item()
      n_samples += targets.numel()

  return epoch_loss / len(val_loader), n_correct / n_samples

- Main starting point: train the model and evaluate the model

In [None]:
# loss: cross-entropy on integer class labels; optimizer: Adam at the configured learning rate
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

# one training pass followed by one validation pass per epoch, reporting both
for epoch in range(epochs):
  train_metrics = train(model, train_loader, optimizer, criterion)
  valid_metrics = evaluate(model, val_loader, criterion)
  train_loss, train_acc = train_metrics
  valid_loss, valid_acc = valid_metrics

  print(f'Epoch: {epoch+1:02}')
  print(f'\tTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}')
  print(f'\t Val. Loss: {valid_loss:.4f} |  Val. Acc: {valid_acc:.4f}')



Epoch: 01
	Train Loss: 0.7994 | Train Acc: 0.6475
	 Val. Loss: 0.8105 |  Val. Acc: 0.6150
Epoch: 02
	Train Loss: 0.8040 | Train Acc: 0.6475
	 Val. Loss: 0.8024 |  Val. Acc: 0.6150
Epoch: 03
	Train Loss: 0.7895 | Train Acc: 0.6475
	 Val. Loss: 0.7940 |  Val. Acc: 0.6150
Epoch: 04
	Train Loss: 0.7817 | Train Acc: 0.6475
	 Val. Loss: 0.7853 |  Val. Acc: 0.6150
Epoch: 05
	Train Loss: 0.7686 | Train Acc: 0.6469
	 Val. Loss: 0.7758 |  Val. Acc: 0.6150
