layout.show(12)
?text
?legend
?title
?legend
a<-c(1,2)
a[0]
a[1]
?cor
cor(c(1:10),c(1:10))
?cor
?boxplot
a<-c(1,2,3)
load("~/.RData")
a
b<-c(2,3,4)
q()
q()
library(ggplot2)
install.packages(ggplot2)
install.packages('ggplot2')
library(ggplot2)
ggplot(mpg, aes(x=displ, y=hwy)) + geom_point(aes=(color=class))
ggplot(mpg, aes(x=displ, y=hwy)) + geom_point(aes=(colour=class))
ggplot(mpg, aes(x=displ, y=hwy)) + geom_point(aes(colour=class))
ggplot(mpg, aes(x= displ, y= hwy)) + geom_point(aes(colour=class))
head(mpg)
ggplot(mpg, aes(x= displ, y= hwy)) + geom_point(aes(colour=class))
ggplot(mpg, aes(x= displ, y= hwy)) + geom_point(aes(colour=class))
ggplot(mpg, aes(x= displ, y= hwy)) + geom_point(aes(colour = class))
ggplot(mpg, aes(x= displ, y= hwy)) + geom_point(aes(colour(class)))
ggplot(mpg, aes(x= displ, y= hwy)) + geom_point(aes(colour(class))
)
ggplot(mpg, aes(x= displ, y= hwy)) + geom_point(aes(colour=class))
summary(mpg)
sessionInfo()
warnings()
15106 %% 15
3091 %% 15
6693 %% 15
?heatmap
require(graphics); require(grDevices)
x  <- as.matrix(mtcars)
rc <- rainbow(nrow(x), start = 0, end = .3)
cc <- rainbow(ncol(x), start = 0, end = .3)
hv <- heatmap(x, col = cm.colors(256), scale = "column",
RowSideColors = rc, ColSideColors = cc, margins = c(5,10),
xlab = "specification variables", ylab =  "Car Models",
main = "heatmap(<Mtcars data>, ..., scale = \"column\")")
utils::str(hv) # the two re-ordering index vectors
## no column dendrogram (nor reordering) at all:
heatmap(x, Colv = NA, col = cm.colors(256), scale = "column",
RowSideColors = rc, margins = c(5,10),
xlab = "specification variables", ylab =  "Car Models",
main = "heatmap(<Mtcars data>, ..., scale = \"column\")")
head(x)
?heatmap
?heatmpa
?heatmap
c('a','b')[[7]]
c('a','b')[7]
list(a='b')['a']
sin(pi / 2)
x <- 3 * 4
x
x = 3*4
x
x <- 3
x <- 3 *4
x
divide <- function(numerator, denominator) { numerator/denominator }
divide(2,1)
divide(denominator=2,numerator=1)
divide(denominator<-2,numerator<-1)  # yields 2, a wrong answer
this_is_a_really_long_name <- 2.5
this_is_a_really_long_name
x<-1
good <- function() { x <- 5}
good()
print(x)
## [1] 1
bad <- function() { x <<- 5}
bad()
print(x)
## [1] 5
seq(1,10)
y <- seq(1, 10, length.out = 5)
(y <- seq(1, 10, length.out = 5))
y <- seq(1, 10, length.out = 5)
y
(y <- seq(1, 10, length.out = 5))
my_variable <- 10
my_varıable
library(tidyverse)
ggplot(dota = mpg) + geom_point(mapping = aes(x = displ, y = hwy))
fliter(mpg, cyl = 8)
filter(diamond, carat > 3)
install.packages("tidyverse")
library(tidyverse)
ggplot(dota = mpg) + geom_point(mapping = aes(x = displ, y = hwy))
fliter(mpg, cyl = 8)
filter(diamond, carat > 3)
ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy))
warnings()
fliter(mpg, cyl = 8)
filter(mpg, cyl = 8)
filter(mpg, cyl == 8)
filter(diamond, carat > 3)
filter(diamonds, carat > 3)
c(T,T,F,F) == c(T,F,T,F)
c(T,T,F,F) & c(T,F,T,F)
c(T,T,F,F) | c(T,F,T,F)
c(T,T,F,F) & c(T,F,T,F)
c(T,T,F,F) && c(T,F,T,F)
c(T,T,F,F) && c(F,T,F,T)
c(T,T,F,F) == c(T,F,T,F)
identical(c(T,T,F,F),c(T,F,T,F))
all.equal(c(T,T,F,F),c(T,F,T,F))
class(c(1,2))
ls()
vec <- c(1,2)
fun <- function(v) { v[[2]]<-5; print(v)}
fun(vec)
print(vec)
1/5
3/5-2/5
1/5==3/5-2/5
sprintf("%.20f",1/5)
sprintf("%.20f",3/5-2/5)
all.equal(1/5,3/5-2/5)
1:2*5
1:(2*5)
rep(1,10)
rep(10,1)
a<-c(1:10)
length(a)
a[1]
a[[1]]
a[11]
a[[11]]
b<-c()
length(b)
is.null(b)
is.na(b)
c(6,'fred')
list(6,'fred')
x <- list('a'=6,b='fred')
names(x)
x$a
x$b
x[['a']]
c('a','b')[[7]]
c('a','b')[7]
c('a','b')[[7]]
c('a','b')[1]
c('a','b')[[1]]
c('a','b')[1]
list(a='b')['a']
list(a='b')[['a']]
class(list(a='b')['a'])
class(list(a='b')[['a'])
class(list(a='b')[['a']])
list(a='b')[c('a','a')]
list(a='b')[[c('a','a')]]
b<-matrix(c(2,4,3,1,5,7), nrow=3,ncol=2)
b[1,2]
b[2,1]
b
t(b)
cbind(b, b)
rbind(b, b)
d = data.frame(x=c(1,2,3), y=c('x','y','z'))
d
str(d)
subset(d,c(T,F,T))
d
str(d)
factor('red',levels=c('red','orange'))
factor('apple',levels=c('red','orange'))
uciCar <- read.table(  	# Note: 1
'http://www.win-vector.com/dfiles/car.data.csv', 	# Note: 2
sep=',', 	# Note: 3
header=T 	# Note: 4
)
head(uciCar)
max(uciCar$doors)
max(as.integer(uciCar$doors))
str(uciCar)
class(uciCar)
summary(uciCar)
dim(uciCar)
summary(as.integer(uciCar$doors))
d <- read.table(paste('http://archive.ics.uci.edu/ml/','machine-learning-databases/statlog/german/german.data',sep=''),stringsAsFactors=F,header=F)
d
d <- read.table("~/Dropbox/13_NCCU/courses/DataScienceInPractice_資料科學實務/105.2/codes/code03/german.data.txt",stringsAsFactors=F,header=F)
d
colnames(d) <- c('Status.of.existing.checking.account',
'Duration.in.month',  'Credit.history', 'Purpose',
'Credit.amount', 'Savings account/bonds',
'Present.employment.since',
'Installment.rate.in.percentage.of.disposable.income',
'Personal.status.and.sex', 'Other.debtors/guarantors',
'Present.residence.since', 'Property', 'Age.in.years',
'Other.installment.plans', 'Housing',
'Number.of.existing.credits.at.this.bank', 'Job',
'Number.of.people.being.liable.to.provide.maintenance.for',
'Telephone', 'foreign.worker', 'Good.Loan')
d$Good.Loan <- as.factor(ifelse(d$Good.Loan==1,'GoodLoan','BadLoan'))
mapping <- list(
'A40'='car (new)',
'A41'='car (used)',
'A42'='furniture/equipment',
'A43'='radio/television',
'A44'='domestic appliances')
head(d)
for(i in 1:(dim(d))[2]) {             	# Note: 1
if(class(d[,i])=='character') {
d[,i] <- as.factor(as.character(mapping[d[,i]]))  	# Note: 2
}
}
summary(d$Purpose)
table(d$Purpose,d$Good.Loan)
2^15
2^14
?axis
?png
?log
log(0)
double.xmin
10-e10
e-10
10^-10
head(mtcars)
str(mtcars)
mtcars$mpg
hist(mtcars$mpg)
d <- density(mtcars$mpg) # returns the density data
plot(d)
?density
library(ggplot2)
ggplot(data=chol, aes(chol$AGE)) + geom_histogram()
ggplot(data=mtcars, aes(mtcars$mpg)) + geom_histogram()
r()
q()
1/5
##[1] 0.2
3/5-2/5
##[1] 0.2
1/5==3/5-2/5
##[1] FALSE
sprintf("%.20f",1/5)
##[1] "0.20000000000000001110”
sprintf("%.20f",3/5-2/5)
##[1] "0.19999999999999995559"
all.equal(1/5,3/5-2/5)
##[1] TRUE
3/5
1/5
##[1] 0.2
3/5-2/5
##[1] 0.2
1/5==3/5-2/5
##[1] FALSE
sprintf("%.20f",1/5)
##[1] "0.20000000000000001110”
sprintf("%.20f",3/5-2/5)
##[1] "0.19999999999999995559"
all.equal(1/5,3/5-2/5)
##[1] TRUE
?plot
x<-1:10
plot(x,x)
1/5
##[1] 0.2
3/5-2/5
##[1] 0.2
1/5==3/5-2/5
##[1] FALSE
sprintf("%.20f",1/5)
##[1] "0.20000000000000001110”
sprintf("%.20f",3/5-2/5)
##[1] "0.19999999999999995559"
all.equal(1/5,3/5-2/5)
##[1] TRUE
1/5
3/5-2/5
1/5==3/5-2/5
sprintf("%.20f",1/5)
sprintf("%.20f",3/5-2/5)
all.equal(1/5,3/5-2/5)
setwd("~/Dropbox/13_NCCU/courses/DataScienceInPractice_資料科學實務/1052/codes/codes14")
setwd("~/Dropbox/13_NCCU/courses/DataScienceInPractice_資料科學實務/1052/codes/codes14/CDC")
# example 7.7 of section 7.2.1
# (example 7.7 of section 7.2.1)  : Linear and logistic regression : Using logistic regression : Understanding logistic regression
# Title: Loading the CDC data
load("NatalRiskData.rData")
train <- sdata[sdata$ORIGRANDGROUP<=5,]
test <- sdata[sdata$ORIGRANDGROUP>5,]
# example 7.8 of section 7.2.2
# (example 7.8 of section 7.2.2)  : Linear and logistic regression : Using logistic regression : Building a logistic regression model
# Title: Building the model formula
complications <- c("ULD_MECO","ULD_PRECIP","ULD_BREECH")
riskfactors <- c("URF_DIAB", "URF_CHYPER", "URF_PHYPER", "URF_ECLAM")
y <- "atRisk"
x <- c("PWGT",
"UPREVIS",
"CIG_REC",
"GESTREC3",
"DPLURAL",
complications,
riskfactors)
fmla <- paste(y, paste(x, collapse="+"), sep="~")
# example 7.9 of section 7.2.2
# (example 7.9 of section 7.2.2)  : Linear and logistic regression : Using logistic regression : Building a logistic regression model
# Title: Fitting the logistic regression model
print(fmla)
## [1] "atRisk ~ PWGT+UPREVIS+CIG_REC+GESTREC3+DPLURAL+ULD_MECO+ULD_PRECIP+
##                    ULD_BREECH+URF_DIAB+URF_CHYPER+URF_PHYPER+URF_ECLAM"
model <- glm(fmla, data=train, family=binomial(link="logit"))
# example 7.10 of section 7.2.3
# (example 7.10 of section 7.2.3)  : Linear and logistic regression : Using logistic regression : Making predictions
# Title: Applying the logistic regression model
train$pred <- predict(model, newdata=train, type="response")
test$pred <- predict(model, newdata=test, type="response")
# example 7.11 of section 7.2.3
# (example 7.11 of section 7.2.3)  : Linear and logistic regression : Using logistic regression : Making predictions
# Title: Plotting distribution of prediction score grouped by known outcome
library('ggplot2')
ggplot(train, aes(x=pred, color=atRisk, linetype=atRisk)) +
geom_density()
# example 7.12 of section 7.2.3
# (example 7.12 of section 7.2.3)  : Linear and logistic regression : Using logistic regression : Making predictions
# Title: Exploring modeling trade-offs
library(ROCR)                                      	# Note: 1
library(grid)                                      	# Note: 2
predObj <- prediction(train$pred, train$atRisk)     	# Note: 3
precObj <- performance(predObj, measure="prec")     	# Note: 4
recObj <- performance(predObj, measure="rec")       	# Note: 5
precision <- (precObj@y.values)[[1]]                	# Note: 6
prec.x <- (precObj@x.values)[[1]]                   	# Note: 7
recall <- (recObj@y.values)[[1]]
rocFrame <- data.frame(threshold=prec.x, precision=precision,
recall=recall)               	# Note: 8
pnull <- mean(as.numeric(train$atRisk))             	# Note: 10
p1 <- ggplot(rocFrame, aes(x=threshold)) +          	# Note: 11
geom_line(aes(y=precision/pnull)) +
coord_cartesian(xlim = c(0,0.05), ylim=c(0,10) )
p2 <- ggplot(rocFrame, aes(x=threshold)) +          	# Note: 12
geom_line(aes(y=recall)) +
coord_cartesian(xlim = c(0,0.05) )
nplot <- function(plist) {                          	# Note: 9
n <- length(plist)
grid.newpage()
pushViewport(viewport(layout=grid.layout(n,1)))
vplayout=function(x,y) {viewport(layout.pos.row=x, layout.pos.col=y)}
for(i in 1:n) {
print(plist[[i]], vp=vplayout(i,1))
}
}
nplot(list(p1, p2))                                	# Note: 13
# Note 1:
#   Load ROCR library.
# Note 2:
#   Load grid library (you’ll need this for the
#   nplot function below).
# Note 3:
#   Create ROCR prediction object.
# Note 4:
#   Create ROCR object to calculate precision as
#   a function of threshold.
# Note 5:
#   Create ROCR object to calculate recall as a
#   function of threshold.
# Note 6:
#   at ( @ ) symbol@ (at) symbolROCR objects are what R calls S4 objects;
#   the slots (or fields) of an S4 object are stored
#   as lists within the object. You extract the slots
#   from an S4 object using @ notation.
# Note 7:
#   The x values (thresholds) are the same in
#   both predObj and recObj, so you only need to
#   extract them once.
# Note 8:
#   Build data frame with thresholds, precision,
#   and recall.
# Note 9:
#   Function to plot multiple plots on one page
#   (stacked).
# Note 10:
#   Calculate rate of at-risk births in the
#   training set.
# Note 11:
#   Plot enrichment rate as a function of
#   threshold.
# Note 12:
#   Plot recall as a function of
#   threshold.
# Note 13:
#   Show both plots simultaneously.
# example 7.13 of section 7.2.3
# (example 7.13 of section 7.2.3)  : Linear and logistic regression : Using logistic regression : Making predictions
# Title: Evaluating our chosen model
ctab.test <- table(pred=test$pred>0.02, atRisk=test$atRisk) 	# Note: 1
ctab.test                                                      	# Note: 2
##        atRisk
## pred    FALSE TRUE
##   FALSE  9487   93
##   TRUE   2405  116
precision <- ctab.test[2,2]/sum(ctab.test[2,])
precision
## [1] 0.04601349
recall <- ctab.test[2,2]/sum(ctab.test[,2])
recall
## [1] 0.5550239
enrich <- precision/mean(as.numeric(test$atRisk))
enrich
## [1] 2.664159
# Note 1:
#   Build confusion matrix.
# Note 2:
#   Rows contain predicted negatives and
#   positives; columns contain actual negatives and
#   positives.
# example 7.14 of section 7.2.4
# (example 7.14 of section 7.2.4)  : Linear and logistic regression : Using logistic regression : Finding relations and extracting advice from logistic models
# Title: The model coefficients
coefficients(model)
summary(model)
pred <- predict(model, newdata=train, type="response")
llcomponents <- function(y, py) {
y*log(py) + (1-y)*log(1-py)
}
edev <- sign(as.numeric(train$atRisk)-pred)*sqrt(-2*llcomponents(as.numeric(train$atRisk), pred))
summary(edev)
set.seed(602957)
x <- rnorm(1000)
noise <- rnorm(1000, sd=1.5)
y <- 3*sin(2*x) + cos(0.75*x) - 1.5*(x^2 ) + noise
select <- runif(1000)
frame <- data.frame(y=y, x = x)
train <- frame[select > 0.1,]
test <-frame[select <= 0.1,]
lin.model <- lm(y ~ x, data=train)
summary(lin.model)
#calculate the root mean squared error (rmse)
resid.lin <- train$y-predict(lin.model)
sqrt(mean(resid.lin^2))
library(mgcv)
glin.model <- gam(y~s(x), data=train)
# y ~ x => the same wi
glin.model$converged
summary(glin.model)
resid.glin <- train$y-predict(glin.model)
sqrt(mean(resid.glin^2))
tual <- test$y
pred.lin <- predict(lin.model, newdata=test)
pred.glin <- predict(glin.model, newdata=test)
resid.lin <- actual-pred.lin
resid.glin <- actual-pred.glin
# Compare the RMSE
sqrt(mean(resid.lin^2))
sqrt(mean(resid.glin^2))
# Compare the R-squared
cor(actual, pred.lin)^2
cor(actual, pred.glin)^2
atual <- test$y
pred.lin <- predict(lin.model, newdata=test)
pred.glin <- predict(glin.model, newdata=test)
resid.lin <- actual-pred.lin
resid.glin <- actual-pred.glin
# Compare the RMSE
sqrt(mean(resid.lin^2))
sqrt(mean(resid.glin^2))
# Compare the R-squared
cor(actual, pred.lin)^2
cor(actual, pred.glin)^2
actual <- test$y
pred.lin <- predict(lin.model, newdata=test)
pred.glin <- predict(glin.model, newdata=test)
resid.lin <- actual-pred.lin
resid.glin <- actual-pred.glin
# Compare the RMSE
sqrt(mean(resid.lin^2))
sqrt(mean(resid.glin^2))
# Compare the R-squared
cor(actual, pred.lin)^2
cor(actual, pred.glin)^2
plot(glin.model)
setwd("../../codes16/newBornWei/")
library(mgcv)
library(ggplot2)
load("NatalBirthData.rData")
train <- sdata[sdata$ORIGRANDGROUP<=5,]
test <- sdata[sdata$ORIGRANDGROUP>5,]
form.lin <- as.formula("DBWT ~ PWGT + WTGAIN + MAGER + UPREVIS")
linmodel <- lm(form.lin, data=train)
summary(linmodel)
form.glin <- as.formula("DBWT ~ s(PWGT) + s(WTGAIN) + s(MAGER) + s(UPREVIS)")
glinmodel <- gam(form.glin, data=train)
glinmodel$converged
summary(glinmodel)
terms <- predict(glinmodel, type="terms")
tframe <- cbind(DBWT = train$DBWT, as.data.frame(terms))
colnames(tframe) <- gsub('[()]', '', colnames(tframe))
pframe <- cbind(tframe, train[,c("PWGT", "WTGAIN", "MAGER", "UPREVIS")])
p1 <- ggplot(pframe, aes(x=PWGT)) + geom_point(aes(y=scale(sPWGT, scale=F))) + geom_smooth(aes(y=scale(DBWT, scale=F)))
terms <- predict(glinmodel, type="terms")
tframe <- cbind(DBWT = train$DBWT, as.data.frame(terms))
colnames(tframe) <- gsub('[()]', '', colnames(tframe))
pframe <- cbind(tframe, train[,c("PWGT", "WTGAIN", "MAGER", "UPREVIS")])
p1 <- ggplot(pframe, aes(x=PWGT)) + geom_point(aes(y=scale(sPWGT, scale=F))) + geom_smooth(aes(y=scale(DBWT, scale=F)))
p1
