dm <- diamonds
 qplot(data = dm, x = carat, y = price,
       xlim = c(0, quantile(dm$carat, 0.99)),
       ylim = c(0, quantile(dm$price, 0.99)))+
   geom_point(fill = I('#F79420'), color = I('black'), shape = 21)
ggplot(dm, aes(x = carat, y = price)) +
   scale_x_continuous(lim = c(0, quantile(dm$carat, 0.99))) +
   scale_y_continuous(lim = c(0, quantile(dm$price, 0.99))) +
   geom_point(fill = I('#F79420'), color = I('black'), shape = 21)

使用统计平滑曲线
ggplot(dm, aes(x = carat, y = price)) +
   geom_point(color = I('#F79420'),alpha= 1/4 ) +
   stat_smooth(method = 'lm')
   scale_x_continuous(lim = c(0, quantile(dm$carat, 0.99))) +
   scale_y_continuous(lim = c(0, quantile(dm$price, 0.99))) 

  install.packages('GGally')
   install.packages('scales')
   install.packages('memisc')
   install.packages('lattice')
   install.packages('MASS')
   install.packages('car')
   install.packages('reshape')
   install.packages('plyr')
   
   library(GGally)
   library(scales)
   library(memisc)
   library(lattice)
   library(MASS)
   library(car)
   library(reshape)
   library(plyr)
set.seed(20022012)
 diamonds_samp <- dm[sample(1:length(dm$price), 10000 ) ,]
ggpairs(diamonds_samp, 
         lower = list(continuous = wrap("points", shape = I('.'))), 
         upper = list(combo = wrap("box", outlier.shape = I('.'))))

library(gridExtra)
 library(grid)
 plot1 <- qplot(data = dm,x = price,binwidth = 100,
                fill = I('#099DD9')) + 
   ggtitle('Price')
 plot2 <- qplot(data = dm,x = price,binwidth = 0.01,
                fill = I('#F79420')) +
   ggtitle('Price (log10)') +
   scale_x_log10()
grid.arrange(plot1,plot2,ncol = 2)

qplot(carat,price,data=dm)+
   scale_y_continuous(trans = log10_trans())+
   ggtitle('Price (log10) by Carat')

cuberoot_trans = function() trans_new('cuberoot',
                       transform = function(x) x^(1/3),
                       inverse = function(x) x^3)
ggplot(aes(carat,price), data = dm) +
   geom_point() +
   scale_x_continuous(trans=cuberoot_trans(),limits= c(0.2,3),
                      breaks = c(0.2,0.5,1,2,3)) +
   scale_y_continuous(trans=log10_trans(),limits= c(350,15000),
                      breaks = c(350,1000,5000,10000,15000)) +
   ggtitle('Price (log10) by Cube-Root of Carat')

head(sort(table(dm$carat),decreasing = T))
 head(sort(table(dm$price),decreasing = T))
  
> head(sort(table(dm$price),decreasing = T))
605 802 625 828 776 698 
 132 127 126 125 124 121 
 > head(sort(table(dm$carat),decreasing = T))
 0.3 0.31 1.01  0.7 0.32    1 
 2604 2249 2242 1981 1840 1558
函数测试
set.seed(100)
 d <- rpois(25,8)
 GetMeanAndSE <- function(x) {
   m <- mean(x)
   n <- length(x)
   SE <- sd(x) / sqrt(n)
   return(c(m, SE))
 }
 GetMeanAndSE(d)
ggplot(aes(carat,price), data = dm) +
   geom_point(alpha = 0.5, size = 0.75, position = 'jitter') +
   scale_x_continuous(trans=cuberoot_trans(),limits= c(0.2,3),
                      breaks = c(0.2,0.5,1,2,3)) +
   scale_y_continuous(trans=log10_trans(),limits= c(350,15000),
                      breaks = c(350,1000,5000,10000,15000)) +
   ggtitle('Price (log10) by Cube-Root of Carat')

library(RColorBrewer)
 ggplot(aes(carat,price,colour= clarity), data = dm) +
   geom_point(alpha = 0.5, size = 0.75, position = 'jitter') +
   scale_color_brewer( type= 'div',
     guide = guide_legend(title = 'Clarity', reverse = TRUE,
               override.aes = list(alpha = 1 ,size =2))) +
   scale_x_continuous(trans=cuberoot_trans(),limits= c(0.2,3),
                      breaks = c(0.2,0.5,1,2,3)) +
   scale_y_continuous(trans=log10_trans(),limits= c(350,15000),
                      breaks = c(350,1000,5000,10000,15000)) +
   ggtitle('Price (log10) by Cube-Root of Carat')

ggplot(aes(x = carat, y = price, color = cut), data = diamonds) + 
   geom_point(alpha = 0.5, size = 1, position = 'jitter') +
   scale_color_brewer(type = 'div',
                      guide = guide_legend(title = 'Clarity', reverse = T,
                                           override.aes = list(alpha = 1, size = 2))) +  
   scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                      breaks = c(0.2, 0.5, 1, 2, 3)) + 
   scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                      breaks = c(350, 1000, 5000, 10000, 15000)) +
   ggtitle('Price (log10) by Cube-Root of Carat and Cut')

ggplot(aes(x = carat, y = price, color = color), data = diamonds) + 
   geom_point(alpha = 0.5, size = 1, position = 'jitter') +
   scale_color_brewer(type = 'div',
         guide = guide_legend(title = 'Color', reverse = F,
               override.aes = list(alpha = 1, size = 2))) +  
   scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                      breaks = c(0.2, 0.5, 1, 2, 3)) + 
   scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                      breaks = c(350, 1000, 5000, 10000, 15000)) +
 ggtitle('Price (log10) by Cube-Root of Carat and Color')

构建线性模型
参考
https://data.princeton.edu/r/linearmodels
m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data=dm)
 m2 <- update(m1, ~ . + carat)
 m3 <- update(m2, ~ . + cut)
 m4 <- update(m3, ~ . + color)
 m5 <- update(m4, ~ . + clarity)
 mtable(m1,m2,m3,m4,m5)
获取更多数据集
从 https://github.com/solomonm/diamonds-data 下载数据集。点击 BigDiamonds.Rda 链接,然后点击“原始数据”按钮开始下载。下载完成后,你就可以通过命令 load("BigDiamonds.rda") 加载数据
install.packages('RCurl')
 install.packages('bitops')
 library(RCurl)
 library(bitops)
 diamondsurl= getBinaryURL("https://raw.github.com/SolomonMg/diamonds-data/blob/master/BigDiamonds.Rda")
 load(rawConnection(diamondsurl))
load("BigDiamonds.rda")    #如果手动下载
 dmb <- diamondsbig
 dmb$logprice = log(dmb$price)
 m1 <- lm(logprice ~ I(carat^(1/3)),
     data=dmb[dmb$price < 10000 &
                dmb$cert == "GIA",])
 m2 <- update(m1, ~ . + carat)
 m3 <- update(m2, ~ . + cut)
 m4 <- update(m3, ~ . + color)
 m5 <- update(m4, ~ . + clarity)
 suppressMessages(library(lattice))
 suppressMessages(library(MASS))
 suppressMessages(library(memisc))
 mtable(m1, m2, m3, m4, m5)
 models <- mtable(m1, m2, m3, m4, m5)
thisDiamond = data.frame(carat = 1.00, cut = "V.Good",
                          color = "I", clarity="VS1")
 modelEstimate =predict(m5,newdata = thisDiamond,
                        interval = "prediction", level = .95)
 exp(modelEstimate)
dat = data.frame(m4$model, m4$residuals)
with(dat, sd(m4.residuals))
with(subset(dat, carat > .9 & carat < 1.1), sd(m4.residuals))
dat$resid <- as.numeric(dat$m4.residuals)
 ggplot(aes(y = resid, x = round(carat, 2)), data = dat) + 
   geom_line(stat = "summary", fun.y = sd) 











