# Build a small numeric vector and probe how indexing behaves.
a <- c(1, 2)
a[0]  # index 0 selects nothing and yields an empty vector
a[1]  # R is 1-based, so this is the first element
help("cor")
cor(1:10, 1:10)
help("cor")
help("boxplot")
# Rebind `a` to a three-element vector.
a <- c(1, 2, 3)
# Restore the saved workspace from the home directory; any object it holds
# named `a` replaces the one defined above.
load("~/.RData")
a
b <- c(2, 3, 4)
# End the session (the command was issued twice in the original history).
quit()
quit()
# Scatter plot of engine displacement against highway mileage, coloured by
# vehicle class, using the mpg data set.
#
# install.packages() takes the package name as a string; the original
# unquoted call failed. Install only when the package is missing so that
# re-running the script does not download it again.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

head(mpg)
summary(mpg)

# The per-layer colour mapping has to be an aes() call with a named argument:
# aes(colour = class). The earlier attempts `aes = (colour = class)` and
# `aes(colour(class))` are not valid mappings and have been dropped.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = class))

sessionInfo()
warnings()
# Remainders modulo 15.
15106 %% 15
3091 %% 15
6693 %% 15

# Heat map of the mtcars data (the example from the heatmap help page).
?heatmap
# library() stops with an error if a package cannot be loaded, whereas
# require() only returns FALSE and lets the script continue in a broken state.
library(graphics)
library(grDevices)
x <- as.matrix(mtcars)
# One side colour per row and per column.
rc <- rainbow(nrow(x), start = 0, end = .3)
cc <- rainbow(ncol(x), start = 0, end = .3)
hv <- heatmap(x, col = cm.colors(256), scale = "column",
              RowSideColors = rc, ColSideColors = cc, margins = c(5, 10),
              xlab = "specification variables", ylab = "Car Models",
              main = "heatmap(<Mtcars data>, ..., scale = \"column\")")
utils::str(hv) # the two re-ordering index vectors
## no column dendrogram (nor reordering) at all:
heatmap(x, Colv = NA, col = cm.colors(256), scale = "column",
        RowSideColors = rc, margins = c(5, 10),
        xlab = "specification variables", ylab = "Car Models",
        main = "heatmap(<Mtcars data>, ..., scale = \"column\")")
head(x)
# The misspelled lookup `?heatmpa` was removed; this is the intended topic.
?heatmap
# `[[` with a position past the end is an error, while `[` returns NA.
# try() reports the error without aborting the rest of the script.
try(c("a", "b")[[7]])
c("a", "b")[7]
list(a = "b")["a"]
sin(pi / 2)
# Both `<-` and `=` assign at top level; `=` is kept once for comparison.
x <- 3 * 4
x
x = 3 * 4
x
x <- 3
x <- 3 * 4
x
# Plain division helper, used to contrast positional and named arguments.
divide <- function(numerator, denominator) {
  numerator / denominator
}
divide(2, 1)
divide(denominator = 2, numerator = 1)
# `<-` inside a call assigns in the calling environment and passes the values
# by position, so this evaluates divide(2, 1).
divide(denominator <- 2, numerator <- 1)  # yields 2, a wrong answer
this_is_a_really_long_name <- 2.5
this_is_a_really_long_name
x <- 1
# Binds a function-local x; the outer x is left alone.
good <- function() {
  x <- 5
}
good()
print(x)
## [1] 1
# `<<-` assigns to the x found outside the function.
bad <- function() {
  x <<- 5
}
bad()
print(x)
## [1] 5
seq(1, 10)
y <- seq(1, 10, length.out = 5)
# Wrapping an assignment in parentheses assigns and prints in one step.
(y <- seq(1, 10, length.out = 5))
y <- seq(1, 10, length.out = 5)
y
(y <- seq(1, 10, length.out = 5))
my_variable <- 10
# The original lookup was spelled with a dotless "ı", which names a different
# (undefined) object; the name now matches the assignment above.
my_variable
# Install tidyverse only when it is missing, then attach it. The original
# history attached the package before it had been installed.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# Corrected forms of the three commands that were retried in the history:
#   `dota =`     -> `data =`
#   `fliter()`   -> `filter()`, and `cyl = 8` -> `cyl == 8` (comparison)
#   `diamond`    -> `diamonds`
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))
warnings()
filter(mpg, cyl == 8)
filter(diamonds, carat > 3)
# Element-wise comparison and logic on logical vectors. TRUE/FALSE are spelled
# out because T and F are ordinary variables that can be reassigned.
c(TRUE, TRUE, FALSE, FALSE) == c(TRUE, FALSE, TRUE, FALSE)
c(TRUE, TRUE, FALSE, FALSE) & c(TRUE, FALSE, TRUE, FALSE)
c(TRUE, TRUE, FALSE, FALSE) | c(TRUE, FALSE, TRUE, FALSE)
c(TRUE, TRUE, FALSE, FALSE) & c(TRUE, FALSE, TRUE, FALSE)
# `&&` is a scalar operator: older R silently used only the first elements,
# and R >= 4.3 raises an error for longer operands. Selecting the first
# elements explicitly gives the same answers on every version.
c(TRUE, TRUE, FALSE, FALSE)[1] && c(TRUE, FALSE, TRUE, FALSE)[1]
c(TRUE, TRUE, FALSE, FALSE)[1] && c(FALSE, TRUE, FALSE, TRUE)[1]
c(TRUE, TRUE, FALSE, FALSE) == c(TRUE, FALSE, TRUE, FALSE)
# Whole-object comparisons: identical() gives one TRUE/FALSE, all.equal()
# describes the difference.
identical(c(TRUE, TRUE, FALSE, FALSE), c(TRUE, FALSE, TRUE, FALSE))
all.equal(c(TRUE, TRUE, FALSE, FALSE), c(TRUE, FALSE, TRUE, FALSE))
class(c(1, 2))
ls()
vec <- c(1, 2)
# Overwrites the second element of its argument and prints the result; the
# change is made to the function's own copy.
fun <- function(v) {
  v[[2]] <- 5
  print(v)
}
fun(vec)
# The caller's vector is unchanged.
print(vec)
# Two ways of writing 0.2 that differ in their binary representation.
1 / 5
3 / 5 - 2 / 5
(1 / 5) == (3 / 5 - 2 / 5)
sprintf("%.20f", 1 / 5)
sprintf("%.20f", 3 / 5 - 2 / 5)
# all.equal() compares with a numeric tolerance.
all.equal(1 / 5, 3 / 5 - 2 / 5)
# `:` binds tighter than `*`.
(1:2) * 5
1:(2 * 5)
rep(1, times = 10)
rep(10, times = 1)
a <- 1:10
length(a)
a[1]
a[[1]]
# Past the end of the vector `[` yields NA, whereas `[[` raises
# "subscript out of bounds"; try() keeps the script running.
a[11]
try(a[[11]])
# c() with no arguments is NULL: length 0, and is.na() of it is logical(0).
b <- c()
length(b)
is.null(b)
is.na(b)
# c() coerces to a common type; list() keeps each element's own type.
c(6, "fred")
list(6, "fred")
x <- list(a = 6, b = "fred")
names(x)
x$a
x$b
x[["a"]]
# Out-of-range access: `[[` errors, `[` returns NA. The error cases are
# wrapped in try() so the remaining statements still run.
try(c("a", "b")[[7]])
c("a", "b")[7]
c("a", "b")[1]
c("a", "b")[[1]]
# `[` on a list returns a list; `[[` returns the element itself.
list(a = "b")["a"]
list(a = "b")[["a"]]
class(list(a = "b")["a"])
# The unbalanced `class(list(a='b')[['a'])` from the history was a syntax
# error; this is the completed form.
class(list(a = "b")[["a"]])
list(a = "b")[c("a", "a")]
# `[[` with a vector indexes recursively, which fails here.
try(list(a = "b")[[c("a", "a")]])
# matrix() fills column by column: column 1 is 2, 4, 3 and column 2 is 1, 5, 7.
b <- matrix(c(2, 4, 3, 1, 5, 7), nrow = 3, ncol = 2)
b[1, 2]
b[2, 1]
b
t(b)
cbind(b, b)
rbind(b, b)
d <- data.frame(x = c(1, 2, 3), y = c("x", "y", "z"))
d
str(d)
# Keep rows 1 and 3. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
subset(d, c(TRUE, FALSE, TRUE))
d
str(d)
factor("red", levels = c("red", "orange"))
# A value that is not among the levels becomes NA.
factor("apple", levels = c("red", "orange"))
# Read the car data set straight from its URL.
uciCar <- read.table(
  "http://www.win-vector.com/dfiles/car.data.csv",  # file location
  sep = ",",                                        # comma-separated fields
  header = TRUE                                     # first row holds column names
)
head(uciCar)
max(uciCar$doors)
# NOTE(review): as.integer() yields NA (with a warning) for any entry that is
# not a plain number — confirm against the data before relying on this.
max(as.integer(uciCar$doors))
str(uciCar)
class(uciCar)
summary(uciCar)
dim(uciCar)
summary(as.integer(uciCar$doors))
# Load the German credit data, first from the UCI archive and then from the
# local copy (the second read replaces the first).
d <- read.table(
  paste0("http://archive.ics.uci.edu/ml/",
         "machine-learning-databases/statlog/german/german.data"),
  stringsAsFactors = FALSE, header = FALSE
)
d
d <- read.table("~/Dropbox/13_NCCU/courses/DataScienceInPractice_資料科學實務/105.2/codes/code03/german.data.txt",
                stringsAsFactors = FALSE, header = FALSE)
d
colnames(d) <- c("Status.of.existing.checking.account",
                 "Duration.in.month", "Credit.history", "Purpose",
                 "Credit.amount", "Savings account/bonds",
                 "Present.employment.since",
                 "Installment.rate.in.percentage.of.disposable.income",
                 "Personal.status.and.sex", "Other.debtors/guarantors",
                 "Present.residence.since", "Property", "Age.in.years",
                 "Other.installment.plans", "Housing",
                 "Number.of.existing.credits.at.this.bank", "Job",
                 "Number.of.people.being.liable.to.provide.maintenance.for",
                 "Telephone", "foreign.worker", "Good.Loan")
d$Good.Loan <- as.factor(ifelse(d$Good.Loan == 1, "GoodLoan", "BadLoan"))
# Code -> description lookup. Only these five codes are defined here.
mapping <- list(
  "A40" = "car (new)",
  "A41" = "car (used)",
  "A42" = "furniture/equipment",
  "A43" = "radio/television",
  "A44" = "domestic appliances"
)
head(d)
# Recode every character column through `mapping` and convert it to a factor.
# Codes missing from `mapping` are kept as they are: indexing the list with an
# unknown name returns NULL, which the original as.character() call turned
# into the literal string "NULL" for every unmapped value.
for (i in seq_len(ncol(d))) {
  if (is.character(d[[i]])) {
    codes <- d[[i]]
    known <- codes %in% names(mapping)
    codes[known] <- as.character(unlist(mapping[codes[known]], use.names = FALSE))
    d[[i]] <- as.factor(codes)
  }
}
summary(d$Purpose)
table(d$Purpose, d$Good.Loan)
2^15
2^14
?axis
?png
?log
log(0)
# The smallest positive normalised double lives in .Machine; a bare
# `double.xmin` is not an object.
.Machine$double.xmin
# Scientific notation needs a leading mantissa: `10-e10` and `e-10` both
# referenced an undefined variable `e`.
1e-10
10^-10
head(mtcars)
str(mtcars)
mtcars$mpg
hist(mtcars$mpg)
d <- density(mtcars$mpg) # returns the density data
plot(d)
?density
library(ggplot2)
# Histogram of mpg. Inside aes() the column is referred to by name; the data
# frame is supplied once through `data`.
# Removed from the original history: a plot of `chol` (in base R that name is
# a function, and no such data set is created here) and the undefined call
# `r()`.
ggplot(data = mtcars, aes(x = mpg)) +
  geom_histogram()
q()
# Floating-point demonstration, first pass (with recorded output).
1 / 5
## [1] 0.2
3 / 5 - 2 / 5
## [1] 0.2
(1 / 5) == (3 / 5 - 2 / 5)
## [1] FALSE
sprintf("%.20f", 1 / 5)
## [1] "0.20000000000000001110"
sprintf("%.20f", 3 / 5 - 2 / 5)
## [1] "0.19999999999999995559"
all.equal(1 / 5, 3 / 5 - 2 / 5)
## [1] TRUE
3 / 5
# Second pass.
1 / 5
## [1] 0.2
3 / 5 - 2 / 5
## [1] 0.2
(1 / 5) == (3 / 5 - 2 / 5)
## [1] FALSE
sprintf("%.20f", 1 / 5)
## [1] "0.20000000000000001110"
sprintf("%.20f", 3 / 5 - 2 / 5)
## [1] "0.19999999999999995559"
all.equal(1 / 5, 3 / 5 - 2 / 5)
## [1] TRUE
help("plot")
x <- seq_len(10)
plot(x, x)
# Third pass.
1 / 5
## [1] 0.2
3 / 5 - 2 / 5
## [1] 0.2
(1 / 5) == (3 / 5 - 2 / 5)
## [1] FALSE
sprintf("%.20f", 1 / 5)
## [1] "0.20000000000000001110"
sprintf("%.20f", 3 / 5 - 2 / 5)
## [1] "0.19999999999999995559"
all.equal(1 / 5, 3 / 5 - 2 / 5)
## [1] TRUE
# Fourth pass, without the recorded output.
1 / 5
3 / 5 - 2 / 5
(1 / 5) == (3 / 5 - 2 / 5)
sprintf("%.20f", 1 / 5)
sprintf("%.20f", 3 / 5 - 2 / 5)
all.equal(1 / 5, 3 / 5 - 2 / 5)
# R has no C-style `cond ? a : b`; `if` is itself an expression and returns
# the value of the branch taken.
z <- if (1 > 0) "a" else "b"
11000000 - 10720000
(11000000 - 10720000) / 160000
?polygon
2^8
2^7
2^3
log10(2.57646712135004e-05)
49548335 / 1000000
49548335 / 10000000
49548335 / 1000000
log10(49548335 / 1000000)
2.57646712135004e-05
1 / 3575368
# A trailing `=` turned the next line into an assignment target; removed.
324000 / 4
324000 / 5
324000 / 5 / 10
# Multiplication is `*`; `6500x5` does not parse.
6500 * 5
6500 * 5 * 10
7000 * 5 * 10
75000 * 4
75000 * 4 * 3 / 4
# Help topic corrected from the misspelled `hluster`.
?hclust
x <- 1
# Local assignment: only a function-level x is created.
good <- function() {
  x <- 5
}
good()
print(x)
# Super-assignment: modifies the x defined outside the function.
bad <- function() {
  x <<- 5
}
bad()
print(x)
y <- seq(1, 10, length.out = 5)
# Parentheses around an assignment also print the assigned value.
(y <- seq(1, 10, length.out = 5))
y <- seq(1, 10, length.out = 5)
(y <- seq(1, 10, length.out = 5))
# Element-wise operators on logical vectors (TRUE/FALSE spelled out, since
# T and F can be reassigned).
c(TRUE, TRUE, FALSE, FALSE) == c(TRUE, FALSE, TRUE, FALSE)
c(TRUE, TRUE, FALSE, FALSE) & c(TRUE, FALSE, TRUE, FALSE)
c(TRUE, TRUE, FALSE, FALSE) | c(TRUE, FALSE, TRUE, FALSE)
# `&&` accepts only length-one operands (an error for longer ones since
# R 4.3), so the first elements are selected explicitly.
c(TRUE, TRUE, FALSE, FALSE)[1] && c(TRUE, FALSE, TRUE, FALSE)[1]
c(TRUE, TRUE, FALSE, FALSE) == c(TRUE, FALSE, TRUE, FALSE)
all.equal(1 / 5, 3 / 5 - 2 / 5)
# 2 x 2 matrix, filled column-wise by default.
x <- matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2)
matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2)
# Same values filled row-wise.
matrix(c(1, 2, 3, 4), 2, 2, byrow = TRUE)
help("cor")
# Unseeded draws.
rnorm(50)
rnorm(50)
rnorm(50)
# Draws after fixing the seed; resetting the seed restarts the stream.
set.seed(1303)
rnorm(50)
rnorm(50)
rnorm(50)
set.seed(1303)
rnorm(50)
set.seed(1303)
rnorm(50)
# Both assignment operators produce the same binding at top level.
x <- 3 * 4
x
x = 3 * 4
x
# Division helper for the argument-matching examples below.
divide <- function(numerator, denominator) {
  numerator / denominator
}
divide(2, 1)
divide(denominator = 2, numerator = 1)
# Assignments inside the call are passed by position: this is divide(2, 1).
divide(denominator <- 2, numerator <- 1)
this_is_a_really_long_name <- 2.5
seq(1, 10)
y <- seq(1, 10, length.out = 5)
y
(y <- seq(1, 10, length.out = 5))
# Element-wise AND over two logical vectors.
c(TRUE, TRUE, FALSE, FALSE) & c(TRUE, FALSE, TRUE, FALSE)
# `&&` is for single values only (longer operands are an error since R 4.3);
# compare the first elements explicitly.
c(TRUE, TRUE, FALSE, FALSE)[1] && c(TRUE, FALSE, TRUE, FALSE)[1]
rep(1, 10)
rep(10, 1)
# c() with no arguments is NULL.
b <- c()
length(b)
is.null(b)
is.na(b)
?cor
# c() coerces to one type; list() keeps each element's type.
c(6, "fred")
list(6, "fred")
# 3 x 2 matrix filled column-wise; assigned repeatedly as in the session.
b <- matrix(c(2, 4, 3, 1, 5, 7), nrow = 3, ncol = 2)
b <- matrix(c(2, 4, 3, 1, 5, 7), nrow = 3, ncol = 2)
b
# Parenthesised assignment prints the value as well.
(b <- matrix(c(2, 4, 3, 1, 5, 7), nrow = 3, ncol = 2))
b
# Single elements by [row, column].
b[1, 2]
b[2, 1]
# Read the car data set from its URL and summarise it.
uciCar <- read.table(
  "http://www.win-vector.com/dfiles/car.data.csv",  # file location
  sep = ",",                                        # comma-separated fields
  header = TRUE                                     # first row holds column names
)
summary(uciCar)
str(uciCar)
# Numeric literals cannot contain thousands separators; `324,000/5` was a
# syntax error.
324000 / 5

# Advertising data: the working read is the one with both the comma separator
# and the header row; the earlier trial reads without them were dropped.
d <- read.table("~/Downloads/Advertising.csv", sep = ",", header = TRUE)
head(d)
summary(d)
# summary() of a data frame is a character table; index it by [row, column].
x <- summary(d)
x
str(x)
x[1, 2]
x[1, 3]
x[3, 3]
x[3, ]
?summary
# median() does not accept a whole data frame, so take the median of each
# numeric column instead.
vapply(Filter(is.numeric, d), median, numeric(1))
summary(d)
x
x[3, ]
# NOTE(review): max() over a data frame requires every column to be numeric —
# confirm that holds for this file.
max(d)
324000 / 5
324000 / 10
(900 / 4) * 6
?hclust
1024 + 64 + 16 + 8 + 2 + 1
0.15 * 8
library(ggbiplot)
help("ggbiplot")
data(iris)
# Log-transform the four measurement columns; column 5 is kept separately for
# grouping the points.
log.ir <- log(iris[, 1:4])
ir.species <- iris[, 5]
# PCA on centred and scaled variables (scale. defaults to FALSE).
ir.pca <- prcomp(log.ir, center = TRUE, scale. = TRUE)
library(ggbiplot)
# Biplot grouped by species, with an untitled horizontal legend on top.
g <- ggbiplot(ir.pca, obs.scale = 1, var.scale = 1, groups = ir.species) +
  scale_color_discrete(name = '') +
  theme(legend.direction = 'horizontal', legend.position = 'top')
print(g)
library(knitr)
# Render the R Markdown example.
knit('~/Dropbox/13_NCCU/courses/DataScienceInPractice_資料科學實務/1061/codes/code10/simple.Rmd')
# `pwd` is a shell command, not an R object; getwd() reports the working
# directory from inside R.
getwd()
(7 * 5 + 14 * 4 + 1 * 2) / 22
(7 * 5 + 14 * 4 + 1 * 2) / 22 * 20
# Work from the protein example directory (the first setwd() in the history
# was immediately superseded by this one).
setwd("~/Dropbox/13_NCCU/courses/DataScienceInPractice_資料科學實務/1061/codes/code11/protein/")

# Attach packages and load helper code before anything uses them. In the
# original order ch_criterion() was called before the helper files were
# sourced, and ggplot() before library(ggplot2).
library(ggplot2)
library(reshape2)
# NOTE(review): ch_criterion() is presumably defined by these helper files —
# confirm.
source("../CH.R")
source("../WSS.R")

# read data
protein <- read.table("protein.txt", sep = "\t", header = TRUE)
# scale your data: every column except the first, keeping the centring and
# scaling attributes
vars.to.use <- colnames(protein)[-1]
pmatrix <- scale(protein[, vars.to.use])
pcenter <- attr(pmatrix, "scaled:center")
pscale <- attr(pmatrix, "scaled:scale")

# Clustering criteria for k = 1..10, reshaped to long form and plotted as one
# line per measure.
clustcrit <- ch_criterion(pmatrix, 10, method = "hclust")
critframe <- data.frame(k = 1:10, ch = scale(clustcrit$crit),
                        wss = scale(clustcrit$wss))
critframe <- melt(critframe, id.vars = c("k"),
                  variable.name = "measure", value.name = "score")
ggplot(critframe, aes(x = k, y = score, color = measure)) +
  geom_point(aes(shape = measure)) + geom_line(aes(linetype = measure)) +
  scale_x_continuous(breaks = 1:10, labels = 1:10)
setwd("~/Dropbox/13_NCCU/courses/DataScienceInPractice_資料科學實務/1061/codes/code11/books//")
library(arules)
library(ggplot2)

# Read the (userid, title) pairs as transactions.
bookbaskets <- read.transactions("bookdata.tsv.gz", format = "single",
                                 sep = "\t", cols = c("userid", "title"),
                                 rm.duplicates = TRUE)
class(bookbaskets)
bookbaskets
dim(bookbaskets)
colnames(bookbaskets)[1:5]
rownames(bookbaskets)[1:5]

# the distribution of transaction sizes
basketSizes <- size(bookbaskets)
summary(basketSizes)
quantile(basketSizes, probs = seq(0, 1, 0.1))
# Each `+` must end a line: a line that begins with `+` is parsed as a new
# statement, so the layers never reach the plot. `binwidth` was dropped
# because geom_density() has no such parameter.
ggplot(data.frame(count = basketSizes)) +
  geom_density(aes(x = count)) +
  scale_x_log10()

# Per-book counts derived from the relative item frequencies.
bookFreq <- itemFrequency(bookbaskets)
sum(bookFreq)
bookCount <- (bookFreq / sum(bookFreq)) * sum(basketSizes)
summary(bookCount)
orderedBooks <- sort(bookCount, decreasing = TRUE)
orderedBooks[1:10]
orderedBooks[1] / dim(bookbaskets)[1]

# Keep only baskets that contain more than one item.
bookbaskets_use <- bookbaskets[basketSizes > 1]
dim(bookbaskets_use)
dim(bookbaskets)
100 / dim(bookbaskets_use)[1]

# Mine association rules and inspect the five with the highest confidence.
rules <- apriori(bookbaskets_use,
                 parameter = list(support = 0.002, confidence = 0.75))
summary(rules)
measures <- interestMeasure(rules,
                            measure = c("coverage", "fishersExactTest"),
                            transactions = bookbaskets_use)
inspect(head((sort(rules, by = "confidence")), n = 5))

# Rules whose right-hand side is restricted to one title.
brules <- apriori(bookbaskets_use,
                  parameter = list(support = 0.001, confidence = 0.6),
                  appearance = list(rhs = c("The Lovely Bones: A Novel"),
                                    default = "lhs"))
summary(brules)
brulesConf <- sort(brules, by = "confidence")
inspect(head(lhs(brulesConf), n = 5))
# k-means on the scaled protein matrix, with many and with one random start.
# NOTE(review): `kbest.p` (the number of clusters) and `pmatrix` are not
# defined in this part of the history; they must exist before this runs.
pclusters <- kmeans(pmatrix, kbest.p, nstart = 100, iter.max = 100)
# The component name is `tot.withinss`; the spaces in `$ tot .withinss` made
# the original line a syntax error.
pclusters$tot.withinss
pclusters <- kmeans(pmatrix, kbest.p, nstart = 1, iter.max = 100)
pclusters$tot.withinss
# Complete-linkage hierarchical clustering on Euclidean distances, with the
# leaves labelled by the Country column.
d <- dist(pmatrix, method = "euclidean")
pfit <- hclust(d, method = "complete")
plot(pfit, labels = protein$Country)
# Hierarchical clustering of 30 random observations (3 variables each), using
# 1 - correlation between observations as the distance.
x <- matrix(rnorm(30 * 3), ncol = 3)
dd <- as.dist(1 - cor(t(x)))
# The method name must be exactly "complete"; the padded string " complete "
# from the first attempt is rejected by hclust().
plot(hclust(dd, method = "complete"), main = " Complete Linkage with Correlation -Based Distance ", xlab = "", sub = "")
