RPubs

by RStudio

junejoh

Hamid

Recently Published

Why you should visit Pakistan once in your lifetime

over 8 years ago

> install.packages("RWeka") Installing package into ‘C:/Users/aarp/Documents/R/win-library/3.3’ (as ‘lib’ is unspecified) trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.3/RWeka_0.4-34.zip' Content type 'application/zip' length 538989 bytes (526 KB) downloaded 526 KB package ‘RWeka’ successfully unpacked and MD5 sums checked The downloaded binary packages are in C:\Users\aarp\AppData\Local\Temp\RtmpkLTbpE\downloaded_packages > #Libraries and download data > #Required libraries > library(tm) Loading required package: NLP > library(ggplot2) Need help? Try the ggplot2 mailing list: http://groups.google.com/group/ggplot2. Attaching package: ‘ggplot2’ The following object is masked from ‘package:NLP’: annotate > library(stringi) > library(NLP) > library(tm) > library(RWeka) Error : .onLoad failed in loadNamespace() for 'rJava', details: call: fun(libname, pkgname) error: No CurrentVersion entry in Software/JavaSoft registry! Try re-installing Java and make sure R and Java have matching architectures. 
Error: package or namespace load failed for ‘RWeka’ > library(magrittr) > library(SnowballC) > > #Download data > > setwd("C:/Users/aarp/Downloads") > getwd() [1] "C:/Users/aarp/Downloads" > > fileUrl <-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip" > if (!file.exists("Coursera-SwiftKey.zip")){ download.file(fileUrl, destfile = "Coursera-SwiftKey.zip", method="curl") unzip("Coursera-SwiftKey.zip") } > > #We will check the file size of the data > > filesz1 <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2 > filesz2 <- file.info("final/en_US/en_US.news.txt")$size / 1024^2 > filesz3 <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2 > fileSIZE <- rbind(filesz1, filesz2, filesz3) > > #Load the text files found in folder > > blogDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "rb") > blogs <- readLines(blogDirectory, encoding = "UTF-8",skipNul = TRUE) > close(blogDirectory) > > newsDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.news.txt", "rb") > news <- readLines(newsDirectory, encoding = "UTF-8",skipNul = TRUE) > close(newsDirectory) > > twitterDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "rb") > twitter <- readLines(twitterDirectory, encoding = "UTF-8",skipNul = TRUE) > close(twitterDirectory) > > > ##Exploratory Data analysis > #Get the number of lines of each source of data > length(blogs) [1] 899288 > > length(twitter) [1] 2360148 > > length(news) [1] 1010242 > > > #Get the number of words per line of data and present the summary > > # Get number of words per line > str1 <- "blogs" > str1 <- "twitter" > str1 <- "news" > > nwords.blogs <- str1 <- "blogs" > nwords.blogs <- stri_count_words(blogs) > nwords.twitter <- stri_count_words(twitter) > nwords.news <- stri_count_words(news) > > #Data Summary > > summary(nwords.blogs) Min. 1st Qu. Median Mean 3rd Qu. Max. 
0.00 9.00 28.00 41.75 60.00 6726.00 > > summary(nwords.twitter) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.00 7.00 12.00 12.75 18.00 47.00 > > summary(nwords.news) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.00 19.00 32.00 34.41 46.00 1796.00 > > wordcount <- rbind(sum(nwords.blogs), sum(nwords.twitter), sum(nwords.news)) > print(wordcount) [,1] [1,] 37546246 [2,] 30093410 [3,] 34762395 > > > ##Exploration of a sample data > #I prepare a random sample of 0.05% of total lines provided by the source data > > set.seed(10000) > s_blogs <- sample(blogs, length(blogs)*0.0005) > set.seed(10000) > s_news <- sample(news, length(news)*0.0005) > set.seed(10000) > s_twitter <- sample(twitter, length(twitter)*0.0005) > > > snwords.blogs <- stri_count_words(s_blogs) > snwords.news <- stri_count_words(s_news) > snwords.twitter <- stri_count_words(s_twitter) > > > df.nwords.all <- data.frame(nword = c(snwords.blogs, snwords.twitter, snwords.news), type = c(rep("blog", length(snwords.blogs)), rep("twitter",length(snwords.twitter)), rep("news", length(snwords.news))) ) > > > #Plotting the density probability of the frequency of the number of words per line. > > ggplot(data = df.nwords.all) + geom_density(aes(nword)) + facet_wrap(~type, nrow = 3) + xlim(0,500) Warning message: Removed 1 rows containing non-finite values (stat_density). > > #Create a corpus and clean it to see which words occur more often. 
I apply this only to the news source > > newsCorpus = Corpus(VectorSource(s_news)) > newsCorpus = tm_map(newsCorpus, content_transformer(tolower)) > newsCorpus = tm_map(newsCorpus, removePunctuation) > newsCorpus = tm_map(newsCorpus, removeNumbers) > > newsDTM = TermDocumentMatrix(newsCorpus, control = list(minWordLength = 1)) > > mnews = as.matrix(newsDTM) > newsOrder <- sort(rowSums(mnews), decreasing = TRUE) > > #Displaying the 10 most frequent words & the 10 least frequent words > > head(newsOrder, 10) the and for that said was with are his but 952 448 179 168 119 110 104 86 78 66 > > tail(newsOrder, 10) casa chandler grande occurred peak 1 1 1 1 1 picacho typical weren’t year’’ “because 1 1 1 1 1 > > ##Conclusions > #I have performed an exploratory analysis. For the text mining, I sampled the dataset to find the most frequently occurring words > #After looking at the data, there are some additional things to do > #1 More data cleansing and better sampling > #2 Create a prediction model and build an application.

over 8 years ago

Hamid - Milestone Report Capstone

over 8 years ago

Plot

Hamid

over 8 years ago

Hamid - Milestone Report Capstone

over 8 years ago

Capstone project

Source Code > library(tm) Loading required package: NLP > library(ggplot2) Need help? Try the ggplot2 mailing list: http://groups.google.com/group/ggplot2. Attaching package: ‘ggplot2’ The following object is masked from ‘package:NLP’: annotate > library(stringi) > library(NLP) > library(tm) > library(RWeka) Error : .onLoad failed in loadNamespace() for 'rJava', details: call: fun(libname, pkgname) error: No CurrentVersion entry in Software/JavaSoft registry! Try re-installing Java and make sure R and Java have matching architectures. Error: package or namespace load failed for ‘RWeka’ > install.packages("RWeka") Installing package into ‘C:/Users/aarp/Documents/R/win-library/3.3’ (as ‘lib’ is unspecified) trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.3/RWeka_0.4-34.zip' Content type 'application/zip' length 538924 bytes (526 KB) downloaded 526 KB package ‘RWeka’ successfully unpacked and MD5 sums checked The downloaded binary packages are in C:\Users\aarp\AppData\Local\Temp\RtmpwlU7rJ\downloaded_packages > library(RWeka) Error : .onLoad failed in loadNamespace() for 'rJava', details: call: fun(libname, pkgname) error: No CurrentVersion entry in Software/JavaSoft registry! Try re-installing Java and make sure R and Java have matching architectures. 
Error: package or namespace load failed for ‘RWeka’ > library(magrittr) > library(SnowballC) > setwd("C:/Users/aarp/Downloads) + "" + setwd("C:/Users/aarp/Downloads") Error: unexpected string constant in: """ setwd("" > setwd("C:/Users/aarp/Downloads") > getwd() [1] "C:/Users/aarp/Downloads" > fileUrl <-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip" > if (!file.exists("Coursera-SwiftKey.zip")){ + download.file(fileUrl, destfile = "Coursera-SwiftKey.zip", method="curl") + unzip("Coursera-SwiftKey.zip") + } > filesz1 <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2 > filesz2 <- file.info("final/en_US/en_US.news.txt")$size / 1024^2 > filesz3 <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2 > fileSIZE <- rbind(filesz1, filesz2, filesz3) > > blogDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "rb") > blogs <- readLines(blogDirectory, encoding = "UTF-8",skipNul = TRUE) > close(blogDirectory) > > newsDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.news.txt", "rb") > news <- readLines(newsDirectory, encoding = "UTF-8",skipNul = TRUE) > close(newsDirectory) > > twitterDirectory <- file("C:/Users/aarp/Downloads/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "rb") > twitter <- readLines(twitterDirectory, encoding = "UTF-8",skipNul = TRUE) > close(twitterDirectory) > > length(blogs) [1] 899288 > > length(twitter) [1] 2360148 > > length(news) [1] 1010242 > > > str1 <- "blogs" > str1 <- "twitter" > str1 <- "news" > > summary(nwords.blogs) Error in summary(nwords.blogs) : object 'nwords.blogs' not found > nwords.blogs <- str1 <- "blogs" > summary(nwords.blogs) Length Class Mode 1 character character > nwords.blogs <- stri_count_words(blogs) > nwords.twitter <- stri_count_words(twitter) > nwords.news <- stri_count_words(news) > > summary(nwords.blogs) Min. 1st Qu. Median Mean 3rd Qu. Max. 
0.00 9.00 28.00 41.75 60.00 6726.00 > > summary(nwords.twitter) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.00 7.00 12.00 12.75 18.00 47.00 > > summary(nwords.news) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.00 19.00 32.00 34.41 46.00 1796.00 > > wordcount <- rbind(sum(nwords.blogs), sum(nwords.twitter), sum(nwords.news)) > print(wordcount) [,1] [1,] 37546246 [2,] 30093410 [3,] 34762395 > > set.seed(10000) > s_blogs <- sample(blogs, length(blogs)*0.0005) > set.seed(10000) > s_news <- sample(news, length(news)*0.0005) > set.seed(10000) > s_twitter <- sample(twitter, length(twitter)*0.0005) > snwords.blogs <- stri_count_words(s_blogs) > snwords.news <- stri_count_words(s_news) > snwords.twitter <- stri_count_words(s_twitter) > > df.nwords.all <- data.frame(nword = c(snwords.blogs, snwords.twitter, snwords.news), type = c(rep("blog", length(snwords.blogs)), rep("twitter",length(snwords.twitter)), rep("news", length(snwords.news))) + ) > > ggplot(data = df.nwords.all) + geom_density(aes(nword)) + facet_wrap(~type, nrow = 3) + xlim(0,500) Warning message: Removed 1 rows containing non-finite values (stat_density). > > newsCorpus = Corpus(VectorSource(s_news)) > newsCorpus = tm_map(newsCorpus, content_transformer(tolower)) > newsCorpus = tm_map(newsCorpus, removePunctuation) > newsCorpus = tm_map(newsCorpus, removeNumbers) > newsDTM = TermDocumentMatrix(newsCorpus, + control = list(minWordLength = 1)) > mnews = as.matrix(newsDTM) > newsOrder <- sort(rowSums(mnews), decreasing = TRUE) > > head(newsOrder, 10) the and for that said was with are his but 952 448 179 168 119 110 104 86 78 66 > > tail(newsOrder, 10) casa chandler grande occurred peak 1 1 1 1 1 picacho typical weren’t year’’ “because 1 1 1 1 1 > > > #Conclusion > #1 - Cleaner data and a better sample > #2 - Create a prediction model and build an application

over 8 years ago