{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"758B-Lab-1-answer.ipynb","provenance":[],"collapsed_sections":[],"toc_visible":true,"authorship_tag":"ABX9TyNqcL4TjZ7KOpGzvyYJEy9Q"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"markdown","metadata":{"id":"JOIWgMXLI-Og"},"source":["# Load data"]},{"cell_type":"code","metadata":{"id":"DHMQ89Hkvowj","executionInfo":{"status":"ok","timestamp":1601323106806,"user_tz":240,"elapsed":475,"user":{"displayName":"Kunpeng Zhang","photoUrl":"","userId":"09274433828486852799"}},"outputId":"8003abb8-a5af-41ae-8cd2-426930af28eb","colab":{"base_uri":"https://localhost:8080/","height":204}},"source":["# load text data and convert the label/sentiment into corresponding numeric values: 'positive':2,'neutral':1,'negative':0\n","# possible packages you might need are: pandas, numpy\n","\n","import pandas as pd\n","import numpy as np\n","\n","# read the training data\n","fname = 'facebook_comments.csv'\n","df_train = pd.read_csv(fname,header=None,names=['text','sentiment'],encoding='iso-8859-1',lineterminator='\\n')\n","sent = {'positive':2,'neutral':1,'negative':0}\n","df_train['labels'] = df_train['sentiment'].str.strip().map(sent)\n","\n","# get texts and labels\n","training_texts = df_train.text.values\n","labels = df_train.labels.values\n","\n","# show the first 5 records\n","df_train.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>text</th>\n","      <th>sentiment</th>\n","      <th>labels</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>Heres a single  to add  to Kindle. Just read t...</td>\n","      <td>neutral</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>If you tire of Non-Fiction.. Check out http://...</td>\n","      <td>neutral</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>Ghost of Round Island is supposedly nonfiction.</td>\n","      <td>neutral</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>Why is Barnes and Nobles version of the Kindle...</td>\n","      <td>negative</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>@Maria:  Do you mean the Nook?  Be careful  bo...</td>\n","      <td>positive</td>\n","      <td>2</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["                                                text  sentiment  labels\n","0  Heres a single  to add  to Kindle. Just read t...    neutral       1\n","1  If you tire of Non-Fiction.. Check out http://...    neutral       1\n","2   Ghost of Round Island is supposedly nonfiction.     neutral       1\n","3  Why is Barnes and Nobles version of the Kindle...   negative       0\n","4  @Maria:  Do you mean the Nook?  Be careful  bo...   positive       2"]},"metadata":{"tags":[]},"execution_count":1}]},{"cell_type":"markdown","metadata":{"id":"OmLg3YKbv7wx"},"source":["# Preprocess dat"]},{"cell_type":"code","metadata":{"id":"6wSTBJNmv-zK","executionInfo":{"status":"ok","timestamp":1601323113847,"user_tz":240,"elapsed":990,"user":{"displayName":"Kunpeng Zhang","photoUrl":"","userId":"09274433828486852799"}},"outputId":"efcec717-5eec-487e-e622-0656d8a7ee87","colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["# preprocess the loaded textual data, including removing stopwords, stemming, and tokenization, etc. \n","# represent each document (i.e., comment) using TF-IDF strategy. The features are the top frequent unigrams across all comments.\n","# possible packages you might need are: scikit-learn, numpy\n","from sklearn.feature_extraction.text import TfidfVectorizer\n","from sklearn.preprocessing import OneHotEncoder\n","\n","# tokenize and create a document-feature matrix X and a label vector Y\n","vectorizer = TfidfVectorizer(stop_words='english',max_features=500)\n","instances = vectorizer.fit_transform(training_texts)\n","X = instances[:len(training_texts)] # training instances\n","Y = np.array(labels) # training labels\n","\n","'''\n","# if using MSE loss, the output for each instance needs to be a vector of output_dimension\n","onehot_encoder = OneHotEncoder(sparse=False)\n","Y = Y1.reshape(-1, 1)\n","Y = onehot_encoder.fit_transform(Y)\n","'''\n","\n","# print out the shape of X and Y\n","print(X.shape,',',Y.shape)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["(1999, 500) , (1999,)\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"22C3Lcp1yOGI"},"source":["# Traditional Machine Learning Models: Random Forest"]},{"cell_type":"code","metadata":{"id":"kcbdokefyY8z","executionInfo":{"status":"ok","timestamp":1601323128401,"user_tz":240,"elapsed":3811,"user":{"displayName":"Kunpeng Zhang","photoUrl":"","userId":"09274433828486852799"}},"outputId":"e66cac3f-fe40-42dc-c1e7-84ad46e8dd14","colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["# using 10-fold cross-validation to show the prediction accuracy\n","# possible packages you might need are: scikit-learn, numpy\n","\n","from sklearn.model_selection import KFold\n","from sklearn.ensemble import RandomForestClassifier\n","\n","kfold = KFold(n_splits=10,shuffle=True,random_state=2020)\n","\n","rf_cvscores = [] # store accuracy score for each fold\n","rf_model = RandomForestClassifier(random_state=2020,max_depth=2,criterion='entropy')\n","\n","for train, test in kfold.split(X):\n","  rf_model.fit(X[train],Y[train])\n","  rf_acc = rf_model.score(X[test],Y[test])\n","  \n","  rf_cvscores.append(rf_acc)\n","\n","print(\"Random Forest - mean: %.4f%% (std: +/- %.4f%%)\" % (np.mean(rf_cvscores)*100, np.std(rf_cvscores)*100))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Random Forest - mean: 64.1332% (std: +/- 2.0919%)\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"CYCKaBtoyj6u"},"source":["# Fully connected feedforward Neural Network"]},{"cell_type":"code","metadata":{"id":"Mny8GWD-oh2d"},"source":["# Design your own network with the following requirements:\n","# 1. Having dropout\n","# 2. Separate the dataset into training and validation (80-20%)\n","# 3. The prediction accuracy on the validation set should be at least 50% for this 3-class classification problem\n","\n","# possible packages you might need are: scikit-learn, numpy, torch\n","import numpy as np\n","import torch\n","import torch.nn as nn\n","import torch.nn.functional as F\n","from torch.utils.data import TensorDataset, DataLoader\n","\n","import torch.optim as optim"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"Ei2_m41Ln_Lb"},"source":["- Build the train loader and validation loader\n"]},{"cell_type":"code","metadata":{"id":"2jfsk3DeolBq"},"source":["# convert your numpy array to TensorDataset and create a data loader for training and validation, respectively\n","# some hyperparameters: input dimension, output dimension, batch size, number of epochs, and the learning rate.\n","epochs = 5\n","lr = 1e-4\n","indim = X.shape[1]\n","outdim = 3\n","drate = 0.7\n","batch_size = 16\n","\n","X_tensor = torch.from_numpy(X.toarray())\n","Y_tensor = torch.from_numpy(Y)\n","\n","dataset = TensorDataset(X_tensor,Y_tensor)\n","train_size = int(0.8 * len(dataset))\n","val_size = len(dataset) - train_size\n","train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])\n","\n","train_loader = DataLoader(train_dataset,batch_size=batch_size, shuffle=True)\n","val_loader = DataLoader(val_dataset,batch_size=batch_size,shuffle=True)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"A99ugLxaooQA"},"source":["- Build the network"]},{"cell_type":"code","metadata":{"id":"FWN5nJm8oqWp","executionInfo":{"status":"ok","timestamp":1601323224400,"user_tz":240,"elapsed":354,"user":{"displayName":"Kunpeng Zhang","photoUrl":"","userId":"09274433828486852799"}},"outputId":"9f51e5ec-02ad-46b8-ed8c-b1f6c7958e03","colab":{"base_uri":"https://localhost:8080/","height":119}},"source":["# create your model/network\n","class SentimentNetwork(nn.Module):\n","\n","  def __init__(self, input_dim, output_dim, dropout_rate):\n","    \n","    super(SentimentNetwork,self).__init__()\n","    self.fc1 = nn.Linear(input_dim,100)\n","    self.dropout = nn.Dropout(p=dropout_rate)\n","    self.fc2 = nn.Linear(100,50)\n","    self.fc3 = nn.Linear(50,output_dim)\n","\n","  def forward(self,x):\n","    \n","    x = F.relu(self.fc1(x))\n","    x = self.dropout(x)\n","    x = F.relu(self.fc2(x))\n","    x = self.dropout(x)\n","    x = F.log_softmax(self.fc3(x))\n","\n","    return x\n","\n","# create a model\n","model = SentimentNetwork(indim,outdim,drate)\n","print(model)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["SentimentNetwork(\n","  (fc1): Linear(in_features=500, out_features=100, bias=True)\n","  (dropout): Dropout(p=0.7, inplace=False)\n","  (fc2): Linear(in_features=100, out_features=50, bias=True)\n","  (fc3): Linear(in_features=50, out_features=3, bias=True)\n",")\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"WcxiBfU8ovt9"},"source":["- Create a training function to train the model and an evaluation function to evaluate the performance on the separate validation set"]},{"cell_type":"code","metadata":{"id":"9LbFEi3Fo4c-"},"source":["# define a training process function\n","def train(model, train_loader, optimizer, criterion):\n","\n","  epoch_loss, epoch_acc = 0.0,0.0\n","\n","  model.train()\n","    \n","  for batch_X, batch_Y in train_loader:\n","    optimizer.zero_grad()             \n","    predictions = model(batch_X.float())\n","    loss = criterion(predictions, batch_Y.long())\n","    acc = np.sum(np.argmax(predictions.detach().numpy(),axis=1) == batch_Y.detach().numpy()) / batch_size\n","    loss.backward()     \n","    optimizer.step()\n","        \n","    epoch_loss += loss.item()\n","    epoch_acc += acc\n","        \n","  return epoch_loss / len(train_loader), epoch_acc / len(train_loader)\n","\n","# define a validation/evaluation process function\n","def evaluate(model, val_loader, criterion):\n","\n","  epoch_loss, epoch_acc = 0.0,0.0\n","\n","  model.eval()\n","    \n","  with torch.no_grad():\n","    for batch_X, batch_Y in val_loader:\n","      predictions = model(batch_X.float())        \n","      loss = criterion(predictions, batch_Y.long())\n","      acc = np.sum(np.argmax(predictions.detach().numpy(),axis=1) == batch_Y.detach().numpy()) / batch_size\n","\n","      epoch_loss += loss.item()\n","      epoch_acc += acc\n","        \n","  return epoch_loss / len(val_loader), epoch_acc / len(val_loader)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"lZrgCqguo-iI"},"source":["- Main starting point: train the model and evaluate the model"]},{"cell_type":"code","metadata":{"id":"cAoFU7Tyyp-d","executionInfo":{"status":"ok","timestamp":1601323257942,"user_tz":240,"elapsed":1402,"user":{"displayName":"Kunpeng Zhang","photoUrl":"","userId":"09274433828486852799"}},"outputId":"64baf3df-b783-4345-fa3e-8164dfc90e1b","colab":{"base_uri":"https://localhost:8080/","height":309}},"source":["# define the loss function and optimizer\n","criterion = nn.CrossEntropyLoss()\n","optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n","\n","# real training and evaluation process\n","for epoch in range(epochs):\n","  train_loss, train_acc = train(model, train_loader, optimizer, criterion)\n","  valid_loss, valid_acc = evaluate(model, val_loader, criterion)\n","    \n","  print(f'Epoch: {epoch+1:02}')\n","  print(f'\\tTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}')\n","  print(f'\\t Val. Loss: {valid_loss:.4f} |  Val. Acc: {valid_acc:.4f}')"],"execution_count":null,"outputs":[{"output_type":"stream","text":["/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:18: UserWarning: Implicit dimension choice for log_softmax has been deprecated. Change the call to include dim=X as an argument.\n"],"name":"stderr"},{"output_type":"stream","text":["Epoch: 01\n","\tTrain Loss: 0.7994 | Train Acc: 0.6475\n","\t Val. Loss: 0.8105 |  Val. Acc: 0.6150\n","Epoch: 02\n","\tTrain Loss: 0.8040 | Train Acc: 0.6475\n","\t Val. Loss: 0.8024 |  Val. Acc: 0.6150\n","Epoch: 03\n","\tTrain Loss: 0.7895 | Train Acc: 0.6475\n","\t Val. Loss: 0.7940 |  Val. Acc: 0.6150\n","Epoch: 04\n","\tTrain Loss: 0.7817 | Train Acc: 0.6475\n","\t Val. Loss: 0.7853 |  Val. Acc: 0.6150\n","Epoch: 05\n","\tTrain Loss: 0.7686 | Train Acc: 0.6469\n","\t Val. Loss: 0.7758 |  Val. Acc: 0.6150\n"],"name":"stdout"}]}]}