library(tidyverse)
library(here)
library(dplyr)
library(tidytuesdayR)
library(tidymodels)
library(ggplot2)
Tidy Tuesday Exercise
### Load all necessary libraries/packages
### About the Data Set
According to the TidyTuesday authors, the data set we will explore today is “a dataset collected at Hewlett-Packard Labs by Mark Hopkins, Erik Reeber, George Forman, and Jaap Suermondt and shared with the UCI Machine Learning Repository. The dataset classifies 4601 e-mails as spam or non-spam, with additional variables indicating the frequency of certain words and characters in the e-mail”.
###Load in the Data
## Download the TidyTuesday data set for the week of 2023-08-15
tuesdata <- tidytuesdayR::tt_load('2023-08-15')
Downloading file 1 of 1: `spam.csv`
## Pull the spam table out of the downloaded object and print it
spam <- tuesdata$spam
spam
# A tibble: 4,601 × 7
crl.tot dollar bang money n000 make yesno
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 278 0 0.778 0 0 0 y
2 1028 0.18 0.372 0.43 0.43 0.21 y
3 2259 0.184 0.276 0.06 1.16 0.06 y
4 191 0 0.137 0 0 0 y
5 191 0 0.135 0 0 0 y
6 54 0 0 0 0 0 y
7 112 0.054 0.164 0 0 0 y
8 49 0 0 0 0 0 y
9 1257 0.203 0.181 0.15 0 0.15 y
10 749 0.081 0.244 0 0.19 0.06 y
# ℹ 4,591 more rows
### Cleaning the Data
Before we begin exploring the data, let's clean up anything that might make it difficult to explore.
## List the distinct values of the yesno column to check that it only contains "y" and "n"
## (this only inspects the column; it does not modify it)
unique(spam$yesno)
[1] "y" "n"
## Inspect each remaining column to confirm it is numeric ("num" in the output)
glimpse(spam$crl.tot)
num [1:4601] 278 1028 2259 191 191 ...
glimpse(spam$n000)
num [1:4601] 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
glimpse(spam$dollar)
num [1:4601] 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
glimpse(spam$bang)
num [1:4601] 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
glimpse(spam$money)
num [1:4601] 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
glimpse(spam$make)
num [1:4601] 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## Since dollar, bang, money, and make are all percentages, count how many
## values in each column are over 1 or below 0 (starting with dollar)
## NOTE(review): this assumes the percentages are stored as fractions (0-1);
## if they are on a 0-100 scale, values over 1 are valid -- confirm against
## the data dictionary.
spam %>% count(dollar > 1, dollar < 0)
# A tibble: 2 × 3
`dollar > 1` `dollar < 0` n
<lgl> <lgl> <int>
1 FALSE FALSE 4565
2 TRUE FALSE 36
## Same range check for bang: how many values are over 1 or below 0
spam %>% count(bang > 1, bang < 0)
# A tibble: 2 × 3
`bang > 1` `bang < 0` n
<lgl> <lgl> <int>
1 FALSE FALSE 4320
2 TRUE FALSE 281
## Same range check for money: how many values are over 1 or below 0
spam %>% count(money > 1, money < 0)
# A tibble: 2 × 3
`money > 1` `money < 0` n
<lgl> <lgl> <int>
1 FALSE FALSE 4512
2 TRUE FALSE 89
## Same range check for make: how many values are over 1 or below 0
spam %>% count(make > 1, make < 0)
# A tibble: 2 × 3
`make > 1` `make < 0` n
<lgl> <lgl> <int>
1 FALSE FALSE 4508
2 TRUE FALSE 93
## In each variable, some observations have a value greater than 1, which this
## analysis treats as illogical, so those observations are removed.
## All four conditions are applied in a single filter: subsetting `spam` once
## per variable and reassigning spam_clean each time would keep only the last
## filter (make) and silently discard the other three.
## NOTE(review): the outputs printed below (e.g. 4,508 rows = 4,601 - 93) come
## from a run in which only the make filter took effect; re-render to refresh.
## NOTE(review): if these columns are on a 0-100 percent scale, values above 1
## are valid and this filter should be dropped -- confirm against the data
## dictionary.
spam_clean <- spam %>%
  filter(dollar <= 1, bang <= 1, money <= 1, make <= 1)
## Cleaned data set
glimpse(spam_clean)
Rows: 4,508
Columns: 7
$ crl.tot <dbl> 278, 1028, 2259, 191, 191, 54, 112, 49, 1257, 749, 21, 184, 26…
$ dollar <dbl> 0.000, 0.180, 0.184, 0.000, 0.000, 0.000, 0.054, 0.000, 0.203,…
$ bang <dbl> 0.778, 0.372, 0.276, 0.137, 0.135, 0.000, 0.164, 0.000, 0.181,…
$ money <dbl> 0.00, 0.43, 0.06, 0.00, 0.00, 0.00, 0.00, 0.00, 0.15, 0.00, 0.…
$ n000 <dbl> 0.00, 0.43, 1.16, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.19, 0.…
$ make <dbl> 0.00, 0.21, 0.06, 0.00, 0.00, 0.00, 0.00, 0.00, 0.15, 0.06, 0.…
$ yesno <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y…
### Explore the Data
Let’s find out how many e-mails were flagged as spam versus not spam.
## Number of e-mails per class (n = not spam, y = spam)
spam_clean %>% count(yesno)
# A tibble: 2 × 2
yesno n
<chr> <int>
1 n 2737
2 y 1771
We are going to focus this exploration on the aspect of money being involved in spam e-mails. Let’s try comparing how many e-mails contain the word ‘money’ with how many contain the ‘$’ sign.
## E-mails without (money == 0) and with (money > 0) the word 'money'
spam_clean %>% count(money == 0, money > 0)
# A tibble: 2 × 3
`money == 0` `money > 0` n
<lgl> <lgl> <int>
1 FALSE TRUE 707
2 TRUE FALSE 3801
## E-mails without (dollar == 0) and with (dollar > 0) the '$' sign
spam_clean %>% count(dollar == 0, dollar > 0)
# A tibble: 2 × 3
`dollar == 0` `dollar > 0` n
<lgl> <lgl> <int>
1 FALSE TRUE 1368
2 TRUE FALSE 3140
## Cross-tabulate absence of 'money' against absence of '$'
spam_clean %>% count(money == 0, dollar == 0)
# A tibble: 4 × 3
`money == 0` `dollar == 0` n
<lgl> <lgl> <int>
1 FALSE FALSE 591
2 FALSE TRUE 116
3 TRUE FALSE 777
4 TRUE TRUE 3024
We can see that approximately twice as many e-mails contain the dollar sign (1,368) as contain the word ‘money’ (707). In addition, of the approximately 1,400 e-mails that contain the ‘$’, only about 600 also contain the word ‘money’. This may indicate that scammers find more success with the dollar sign, although these counts are not yet broken down by spam status, so that remains a hypothesis at this point.
Finally, let’s prepare the data for a plot of how many e-mails contain ‘money’ and ‘$’ and whether they are marked as spam or not.
## Work on a copy so that spam_clean keeps its original values
spam_clean_money <- spam_clean
## Recode money to 1 wherever the word 'money' occurs at all ...
spam_clean_money$money[spam_clean_money$money > 0] <- 1
## ... and also wherever '$' occurs, so money ends up flagging "money or $".
## NOTE(review): this writes into `money`, not `dollar`; if the intent was to
## binarise `dollar` as well, the target should be spam_clean_money$dollar --
## confirm before building the plot.
spam_clean_money$money[spam_clean_money$dollar > 0] <- 1
glimpse(spam_clean_money$money)
num [1:4508] 0 1 1 0 0 0 1 0 1 1 ...
## Inspect dollar in the recoded copy; the recoding above only assigns into
## money, so dollar still holds its original (non-binary) values
glimpse(spam_clean_money$dollar)
num [1:4508] 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
From here we could create a bar plot showing how much of a trend there is between the existence of ‘money’, ‘$’, and both in spam e-mails.