Recently Published
Milestone Capstone
> install.packages("RWeka")
Installing package into ‘C:/Users/aarp/Documents/R/win-library/3.3’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.3/RWeka_0.4-34.zip'
Content type 'application/zip' length 538989 bytes (526 KB)
downloaded 526 KB
package ‘RWeka’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\aarp\AppData\Local\Temp\RtmpkLTbpE\downloaded_packages
> #Libraries and download data
> #Required libraries
> library(tm)
Loading required package: NLP
> library(ggplot2)
Need help? Try the ggplot2 mailing list:
http://groups.google.com/group/ggplot2.
Attaching package: ‘ggplot2’
The following object is masked from ‘package:NLP’:
annotate
> library(stringi)
> library(NLP)
> library(tm)
> library(RWeka)
Error : .onLoad failed in loadNamespace() for 'rJava', details:
call: fun(libname, pkgname)
error: No CurrentVersion entry in Software/JavaSoft registry! Try re-installing Java and make sure R and Java have matching architectures.
Error: package or namespace load failed for ‘RWeka’
> library(magrittr)
> library(SnowballC)
>
> #Download data
>
> setwd("C:/Users/aarp/Downloads")
> getwd()
[1] "C:/Users/aarp/Downloads"
>
> fileUrl <-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
> if (!file.exists("Coursera-SwiftKey.zip")){
download.file(fileUrl, destfile = "Coursera-SwiftKey.zip", method="curl")
unzip("Coursera-SwiftKey.zip")
}
>
> #We will check the file size of the data
>
> filesz1 <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2
> filesz2 <- file.info("final/en_US/en_US.news.txt")$size / 1024^2
> filesz3 <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2
> fileSIZE <- rbind(filesz1, filesz2, filesz3)
>
> #Load the text files found in folder
>
> blogDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "rb")
> blogs <- readLines(blogDirectory, encoding = "UTF-8",skipNul = TRUE)
> close(blogDirectory)
>
> newsDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.news.txt", "rb")
> news <- readLines(newsDirectory, encoding = "UTF-8",skipNul = TRUE)
> close(newsDirectory)
>
> twitterDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "rb")
> twitter <- readLines(twitterDirectory, encoding = "UTF-8",skipNul = TRUE)
> close(twitterDirectory)
>
>
> ##Exploratory Data analysis
> #Get the number of lines of each source of data
> length(blogs)
[1] 899288
>
> length(twitter)
[1] 2360148
>
> length(news)
[1] 1010242
>
>
> #Get the number of words per line of data and present the summary
>
> # Get number of words per line
> str1 <- "blogs"
> str1 <- "twitter"
> str1 <- "news"
>
> nwords.blogs <- str1 <- "blogs"
> nwords.blogs <- stri_count_words(blogs)
> nwords.twitter <- stri_count_words(twitter)
> nwords.news <- stri_count_words(news)
>
> #Data Summary
>
> summary(nwords.blogs)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 9.00 28.00 41.75 60.00 6726.00
>
> summary(nwords.twitter)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 7.00 12.00 12.75 18.00 47.00
>
> summary(nwords.news)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 19.00 32.00 34.41 46.00 1796.00
>
> wordcount <- rbind(sum(nwords.blogs), sum(nwords.twitter), sum(nwords.news))
> print(wordcount)
[,1]
[1,] 37546246
[2,] 30093410
[3,] 34762395
>
>
> ##Exploration of sampled data
> #I draw a random sample of 0.05% of the lines from each data source
>
> set.seed(10000)
> s_blogs <- sample(blogs, length(blogs)*0.0005)
> set.seed(10000)
> s_news <- sample(news, length(news)*0.0005)
> set.seed(10000)
> s_twitter <- sample(twitter, length(twitter)*0.0005)
>
>
> snwords.blogs <- stri_count_words(s_blogs)
> snwords.news <- stri_count_words(s_news)
> snwords.twitter <- stri_count_words(s_twitter)
>
>
> df.nwords.all <- data.frame(nword = c(snwords.blogs, snwords.twitter, snwords.news), type = c(rep("blog", length(snwords.blogs)), rep("twitter",length(snwords.twitter)), rep("news", length(snwords.news)))
)
>
>
> #Plot the probability density of the number of words per line for each source.
>
> ggplot(data = df.nwords.all) + geom_density(aes(nword)) + facet_wrap(~type, nrow = 3) + xlim(0,500)
Warning message:
Removed 1 rows containing non-finite values (stat_density).
>
> #Create a corpus and clean it to see which words occur most often. I apply this only to the news sample.
>
> newsCorpus = Corpus(VectorSource(s_news))
> newsCorpus = tm_map(newsCorpus, content_transformer(tolower))
> newsCorpus = tm_map(newsCorpus, removePunctuation)
> newsCorpus = tm_map(newsCorpus, removeNumbers)
>
> newsDTM = TermDocumentMatrix(newsCorpus,
control = list(minWordLength = 1))
>
> mnews = as.matrix(newsDTM)
> newsOrder <- sort(rowSums(mnews), decreasing = TRUE)
>
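> #Display the 10 most frequent and the 10 least frequent words
> #(Note: no term shorter than three characters appears below. Presumably tm's default wordLengths = c(3, Inf) still applies, because minWordLength does not appear to be a recognized control option in this tm version - to be confirmed.)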
>
> head(newsOrder, 10)
the and for that said was with are his but
952 448 179 168 119 110 104 86 78 66
>
> tail(newsOrder, 10)
casa chandler grande occurred peak
1 1 1 1 1
picacho typical weren’t year’’ “because
1 1 1 1 1
>
> ##Conclusions
> #I have performed an exploratory analysis. Using text mining on a sample of the dataset, I identified the most frequently occurring words.
> #After looking at the data, there are some additional steps to take:
> #1 More data cleansing and better sampling
> #2 Create a prediction model and build an application.
Plot
Hamid
Capstone project
Source Code
> library(tm)
Loading required package: NLP
> library(ggplot2)
Need help? Try the ggplot2 mailing list:
http://groups.google.com/group/ggplot2.
Attaching package: ‘ggplot2’
The following object is masked from ‘package:NLP’:
annotate
> library(stringi)
> library(NLP)
> library(tm)
> library(RWeka)
Error : .onLoad failed in loadNamespace() for 'rJava', details:
call: fun(libname, pkgname)
error: No CurrentVersion entry in Software/JavaSoft registry! Try re-installing Java and make sure R and Java have matching architectures.
Error: package or namespace load failed for ‘RWeka’
> install.packages("RWeka")
Installing package into ‘C:/Users/aarp/Documents/R/win-library/3.3’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.3/RWeka_0.4-34.zip'
Content type 'application/zip' length 538924 bytes (526 KB)
downloaded 526 KB
package ‘RWeka’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\aarp\AppData\Local\Temp\RtmpwlU7rJ\downloaded_packages
> library(RWeka)
Error : .onLoad failed in loadNamespace() for 'rJava', details:
call: fun(libname, pkgname)
error: No CurrentVersion entry in Software/JavaSoft registry! Try re-installing Java and make sure R and Java have matching architectures.
Error: package or namespace load failed for ‘RWeka’
> library(magrittr)
> library(SnowballC)
> setwd("C:/Users/aarp/Downloads)
+ ""
+ setwd("C:/Users/aarp/Downloads")
Error: unexpected string constant in:
"""
setwd(""
> setwd("C:/Users/aarp/Downloads")
> getwd()
[1] "C:/Users/aarp/Downloads"
> fileUrl <-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
> if (!file.exists("Coursera-SwiftKey.zip")){
+ download.file(fileUrl, destfile = "Coursera-SwiftKey.zip", method="curl")
+ unzip("Coursera-SwiftKey.zip")
+ }
> filesz1 <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2
> filesz2 <- file.info("final/en_US/en_US.news.txt")$size / 1024^2
> filesz3 <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2
> fileSIZE <- rbind(filesz1, filesz2, filesz3)
>
> blogDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "rb")
> blogs <- readLines(blogDirectory, encoding = "UTF-8",skipNul = TRUE)
> close(blogDirectory)
>
> newsDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.news.txt", "rb")
> news <- readLines(newsDirectory, encoding = "UTF-8",skipNul = TRUE)
> close(newsDirectory)
>
> twitterDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "rb")
> twitter <- readLines(twitterDirectory, encoding = "UTF-8",skipNul = TRUE)
> close(twitterDirectory)
>
> length(blogs)
[1] 899288
>
> length(twitter)
[1] 2360148
>
> length(news)
[1] 1010242
>
>
> str1 <- "blogs"
> str1 <- "twitter"
> str1 <- "news"
>
> summary(nwords.blogs)
Error in summary(nwords.blogs) : object 'nwords.blogs' not found
> nwords.blogs <- str1 <- "blogs"
> summary(nwords.blogs)
Length Class Mode
1 character character
> nwords.blogs <- stri_count_words(blogs)
> nwords.twitter <- stri_count_words(twitter)
> nwords.news <- stri_count_words(news)
>
> summary(nwords.blogs)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 9.00 28.00 41.75 60.00 6726.00
>
> summary(nwords.twitter)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 7.00 12.00 12.75 18.00 47.00
>
> summary(nwords.news)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 19.00 32.00 34.41 46.00 1796.00
>
> wordcount <- rbind(sum(nwords.blogs), sum(nwords.twitter), sum(nwords.news))
> print(wordcount)
[,1]
[1,] 37546246
[2,] 30093410
[3,] 34762395
>
> set.seed(10000)
> s_blogs <- sample(blogs, length(blogs)*0.0005)
> set.seed(10000)
> s_news <- sample(news, length(news)*0.0005)
> set.seed(10000)
> s_twitter <- sample(twitter, length(twitter)*0.0005)
> snwords.blogs <- stri_count_words(s_blogs)
> snwords.news <- stri_count_words(s_news)
> snwords.twitter <- stri_count_words(s_twitter)
>
> df.nwords.all <- data.frame(nword = c(snwords.blogs, snwords.twitter, snwords.news), type = c(rep("blog", length(snwords.blogs)), rep("twitter",length(snwords.twitter)), rep("news", length(snwords.news)))
+ )
>
> ggplot(data = df.nwords.all) + geom_density(aes(nword)) + facet_wrap(~type, nrow = 3) + xlim(0,500)
Warning message:
Removed 1 rows containing non-finite values (stat_density).
>
> newsCorpus = Corpus(VectorSource(s_news))
> newsCorpus = tm_map(newsCorpus, content_transformer(tolower))
> newsCorpus = tm_map(newsCorpus, removePunctuation)
> newsCorpus = tm_map(newsCorpus, removeNumbers)
> newsDTM = TermDocumentMatrix(newsCorpus,
+ control = list(minWordLength = 1))
> mnews = as.matrix(newsDTM)
> newsOrder <- sort(rowSums(mnews), decreasing = TRUE)
>
> head(newsOrder, 10)
the and for that said was with are his but
952 448 179 168 119 110 104 86 78 66
>
> tail(newsOrder, 10)
casa chandler grande occurred peak
1 1 1 1 1
picacho typical weren’t year’’ “because
1 1 1 1 1
>
>
> #Conclusion
> #1 - More data cleaning and better sampling
> #2 - Create a prediction model and build an application
Cities i have lived in
Cities i have lived in