PPV课数据科学社区数据挖掘常用模型构建示例（R语言版）

数据挖掘常用模型构建示例（R语言版）

Linear Regression

# Linear regression on the Boston data set shipped with MASS.
library(MASS)

# Fit medv on a degree-2 polynomial of rm plus crim.
lm_fit <- lm(medv ~ poly(rm, 2) + crim, data = Boston)

# Inspect coefficients and fit statistics.
summary(lm_fit)

Ridge Regression and Lasso

# Ridge and lasso regression with glmnet.
# Unlike most modelling functions, glmnet() cannot take a formula plus a data
# frame and has no `subset` argument, so the inputs are prepared by hand.
library(glmnet)
library(ISLR)

# Remove rows with missing values.
Hitters <- na.omit(Hitters)

# Build the regression design matrix; [, -1] drops its first column.
x <- model.matrix(Salary ~ ., Hitters)[, -1]
y <- Hitters$Salary

# alpha = 0 builds the ridge model, alpha = 1 the lasso model.
ridge.mod <- glmnet(x, y, alpha = 0, lambda = 0.1)
lasso.mod <- glmnet(x, y, alpha = 1, lambda = 0.1)

Logistic Regression

# Logistic regression on the Smarket data from ISLR.
library(ISLR)

# Logical mask selecting the rows with Year before 2005 (training rows).
train <- Smarket$Year < 2005

# Fit the logistic regression model on the training rows only.
logistic.fit <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = Smarket,
  family = binomial,
  subset = train
)

# Predict on the held-out rows. The model object is `logistic.fit` (the
# original referenced an undefined `glm.fit`), and predict.glm() accepts
# "link", "response" or "terms" -- "response" yields fitted probabilities.
glm.probs <- predict(
  logistic.fit,
  newdata = Smarket[!train, ],
  type = "response"
)

K-Nearest Neighbor

# K-nearest-neighbour classification on the Caravan data from ISLR.
library(class)
library(ISLR)

# Standardize the predictors first; column 86 is left out
# (presumably the Purchase response used below -- confirm).
standardized.X <- scale(Caravan[, -86])

# The first 1000 rows form the test set, the rest the training set.
test <- 1:1000
train.X <- standardized.X[-test, ]
train.Y <- Caravan$Purchase[-test]
test.X <- standardized.X[test, ]
test.Y <- Caravan$Purchase[test]

# knn() returns the predicted classes for the test set directly.
knn.pred <- knn(train.X, test.X, train.Y, k = 3)

Naive Bayes

# Naive Bayes classifier on the iris data.
library(e1071)

# Build the model: columns 1-4 are the predictors, column 5 the class label.
classifier <- naiveBayes(iris[, 1:4], iris[, 5])

# Apply the model to the same predictors and cross-tabulate
# predicted against actual classes.
table(predict(classifier, iris[, -5]), iris[, 5])

Decision Tree

# Classification tree on the Carseats data from ISLR.
library(tree)
library(ISLR)

# Binary response: "Yes" when Sales exceeds 8, otherwise "No".
# Columns are referenced explicitly instead of via attach(), and the response
# is a factor: since R 4.0 data.frame() keeps strings as character, which
# tree() cannot use as a classification response.
High <- factor(ifelse(Carseats$Sales <= 8, "No", "Yes"))
Carseats <- data.frame(Carseats, High)

# Random 200-row training sample; the remaining rows are held out.
train <- sample(seq_len(nrow(Carseats)), 200)
Carseats.test <- Carseats[-train, ]
High.test <- High[-train]

# Build the tree on every predictor except Sales (High is derived from it).
tree.carseats <- tree(High ~ . - Sales, Carseats, subset = train)
summary(tree.carseats)

# Visualize the tree.
plot(tree.carseats)
text(tree.carseats, pretty = 0)

Random Forest

# Random forest regression on the Boston data.
library(randomForest)
library(MASS)

# Random half of the rows for training; the other half is held out.
train <- sample(seq_len(nrow(Boston)), nrow(Boston) / 2)
boston.test <- Boston[-train, ]

# mtry = 6 variables are tried at each split. TRUE is spelled out because
# the shorthand T is an ordinary variable that can be reassigned.
rf.boston <- randomForest(
  medv ~ .,
  data = Boston,
  subset = train,
  mtry = 6,
  importance = TRUE
)

rf.boston
summary(rf.boston)

Boosting

# Gradient boosting regression on the Boston data.
library(gbm)
library(MASS)

# Random half of the rows for training; the other half is held out.
train <- sample(seq_len(nrow(Boston)), nrow(Boston) / 2)
boston.test <- Boston[-train, ]

# Fit on the training rows: 5000 trees, interaction depth 4.
boost.boston <- gbm(
  medv ~ .,
  data = Boston[train, ],
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 4
)

boost.boston
summary(boost.boston)

Principal Component Analysis

# Principal component analysis of USArrests.
# USArrests ships with base R (package datasets), so no extra package is
# loaded here.

# Scale each variable to unit variance before extracting components.
pr.out <- prcomp(USArrests, scale. = TRUE)

# Loadings of each variable on each principal component.
pr.out$rotation

# Biplot of scores and loadings.
biplot(pr.out, scale = 0)

Apriori

# Frequent itemsets and association rules on the Groceries transactions.
library(arules)  # load the arules package
data(Groceries)  # load the data set

# Mine frequent itemsets.
frequentsets <- eclat(
  Groceries,
  parameter = list(support = 0.05, maxlen = 10)
)

# Look at the first ten frequent itemsets found.
inspect(frequentsets[1:10])

# Sort the frequent itemsets by support and look at the top ten
# (equivalent to inspect(sort(frequentsets)[1:10])).
inspect(sort(frequentsets, by = "support")[1:10])

# Mine association rules.
rules <- apriori(
  Groceries,
  parameter = list(support = 0.01, confidence = 0.01)
)

# Summary of the rules found.
summary(rules)

# Keep the rules of interest: "whole milk" on the right-hand side and
# lift of at least 1.2.
x <- subset(rules, subset = rhs %in% "whole milk" & lift >= 1.2)

# Sort that subset by support and look at the top five.
inspect(sort(x, by = "support")[1:5])

K-means and Hierarchical Clustering

# K-means and hierarchical clustering on the NCI60 data from ISLR.
library(ISLR)

nci.labels <- NCI60$labs
nci.data <- NCI60$data

# Standardize the columns, then compute the pairwise distance matrix once.
sd.data <- scale(nci.data)
data.dist <- dist(sd.data)

# k-means with 4 clusters and 20 random starts.
km.out <- kmeans(sd.data, 4, nstart = 20)

# Hierarchical clustering; reuses data.dist rather than recomputing
# dist(sd.data), which yields the same distances.
hc.out <- hclust(data.dist)
plot(hc.out, labels = nci.labels)

Support Vector Machine

# Support vector classifier on the Khan training data from ISLR.
library(e1071)
library(ISLR)

# Combine predictors and response; the response is converted to a factor.
dat <- data.frame(x = Khan$xtrain, y = as.factor(Khan$ytrain))

# Linear kernel with cost 10.
out <- svm(y ~ ., data = dat, kernel = "linear", cost = 10)
summary(out)

Artificial Neural Network

# Feed-forward neural network with AMORE, fitted to y = x1^2 + x2^2.
library(AMORE)

# Randomly generate 2000 values for each of the two inputs.
x1 <- round(runif(2000, 1, 2000))
x2 <- round(runif(2000, 1, 2000))

# Standardize the data; the first 1900 observations are the learning set.
x11 <- scale(x1[1:1900])
x12 <- scale(x2[1:1900])

# The last 100 observations are the test set.
# NOTE(review): the test set is standardized with its own mean/sd rather than
# the learning set's -- kept as in the original; confirm this is intended.
x21 <- scale(x1[1901:2000])
x22 <- scale(x2[1901:2000])

# Targets for the learning set and the test set.
y1 <- x11^2 + x12^2
y2 <- x21^2 + x22^2

# Combine the inputs into matrices.
p <- cbind(x11, x12)
q <- cbind(x21, x22)
target <- y1

# Network with layer sizes 2, 2 and 1.
net <- newff(
  n.neurons = c(2, 2, 1),
  learning.rate.global = 1e-2,
  momentum.global = 0.4,
  error.criterium = "LMS",
  Stao = NA,
  hidden.layer = "tansig",
  output.layer = "purelin",
  method = "ADAPTgdwm"
)

result <- train(
  net, p, target,
  error.criterium = "LMS",
  report = TRUE,
  show.step = 100,
  n.shows = 5
)

# Predict on the test set.
z <- sim(result$net, q)

# Plot the model output for the test set ...
plot(q[1:100, 1], z, col = "blue", pch = "+")
# ... and overlay the true test targets to compare the two.
points(q[1:100, 1], y2, col = "red", pch = "x")

作者：真依然很拉风　链接：http://www.jianshu.com/p/7d32a6a9ca95

推荐：R语言数据分析-上海站

详情点击“阅读原文”

↓↓↓

原文始发于微信公众号（PPV课数据科学社区）：数据挖掘常用模型构建示例（R语言版）

原创文章，作者：ppvke，如若转载，请注明出处：http://www.ppvke.com/archives/10219

数据挖掘常用模型构建示例（R语言版）

Linear Regression

Ridge Regression and Lasso

Logistic Regression

K-Nearest Neighbor

Naive Bayes

Decision Tree

Random Forest

Boosting

Principal Component Analysis

Apriori

K-means and Hierarchical Clustering

Support Vector Machine

Artificial Neural Network

联系我们

4000-51-9191

数据挖掘常用模型构建示例（R语言版）

Linear Regression

Ridge Regression and Lasso

Logistic Regression

K-Nearest Neighbor

Naive Bayes

Decision Tree

Random Forest

Boosting

Principal Component Analysis

Apriori

K-means and Hierarchical Clustering

Support Vector Machine

Artificial Neural Network

相关推荐

请登录

联系我们

4000-51-9191