빅데이터 통계학 (4)

2019. 4. 5. 17:36

# Load the credit data, keeping text columns as character vectors.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
credit <- read.csv("credit_LN4.csv", stringsAsFactors = FALSE)

# Scatter plot of income against balance, with the two default groups drawn
# separately. The axes are set up from the full data set first (type = "n"
# draws no points) so that neither group is clipped by limits computed from
# the other group alone.
plot(income ~ balance, data = credit, type = "n")

# Non-defaulters in red.
points(
  income ~ balance,
  data = credit,
  subset = default == "No",
  col = "red"
)

# points() adds points to the plot that is already drawn: defaulters in blue.
points(
  income ~ balance,
  data = credit,
  subset = default == "Yes",
  col = "blue"
)

# Box plots split left/right by the value of default, using the `~` formula.
# The formula + data form matches the other plot/glm calls in this script and
# gives plain variable names as axis labels.
boxplot(balance ~ default, data = credit)
boxplot(income ~ default, data = credit)

# Original notes: the income variable does not explain the difference in
# default.

# Convert default to a factor and plot it against balance.
credit[["default"]] <- as.factor(credit[["default"]])
plot(default ~ balance, data = credit)

# Convert default to a number (1 when "Yes", 0 otherwise) and plot it against
# balance.
credit[["default1"]] <- ifelse(credit[["default"]] == "Yes", 1, 0)
plot(default1 ~ balance, data = credit)

# Inverse logit function: its value always lies between 0 and 1.
exp(10) / (1 + exp(10))

# inv.logit() from the gtools package can be used for this.
# Install gtools only when it is missing, instead of re-downloading it every
# time the script runs.
if (!requireNamespace("gtools", quietly = TRUE)) {
  install.packages("gtools")
}
library(gtools)

inv.logit(10)

##########################
# Maximum likelihood estimation (MLE)
# Original notes: the Y variable must be converted to a factor first.
logit1 <- glm(default ~ balance, family = "binomial", data = credit)
summary(logit1)

# With beta0 and beta1 estimated, a default probability can be estimated for
# every person.
beta0 <- coef(logit1)[1]
beta1 <- coef(logit1)[2]

# Default probability of an individual with balance = 1000: first with the
# explicit formula, then with inv.logit().
exp(beta0 + 1000 * beta1) / (1 + exp(beta0 + 1000 * beta1))
inv.logit(beta0 + 1000 * beta1)

# Default probability of an individual with balance = 2000.
exp(beta0 + 2000 * beta1) / (1 + exp(beta0 + 2000 * beta1))
inv.logit(beta0 + 2000 * beta1)

# What are the odds when balance = 2000?
# Computed as exp(beta0 + 2000 * beta1), then again as p / (1 - p).
exp(beta0 + 2000 * beta1)
inv.logit(beta0 + 2000 * beta1) / (1 - inv.logit(beta0 + 2000 * beta1))

# Case where the x variable is student.
credit[["student"]] <- as.factor(credit[["student"]])
logit2 <- glm(default ~ student, family = "binomial", data = credit)
summary(logit2)

# Original notes: beta1 is greater than 0, so the default probability is
# higher for students.
# Original notes (marked "??" by the author): it is not statistically
# significant, but the estimate is still what gets used.

# Multiple logistic regression
logit3 <- glm(
  default ~ balance + student,
  family = "binomial",
  data = credit
)
summary(logit3)

# x = balance, income, student
# Original notes: problems like this appear on the exam.
# Converting categorical variables to factors is important!
# Each coefficient assumes the other variables are held constant.
logit4 <- glm(
  default ~ balance + student + income,
  family = "binomial",
  data = credit
)
summary(logit4)

# Apply the model to all 500 people.
# Predict with logit4; type = "response" means "compute the probability of y".
credit[["pr_hat"]] <- predict(logit4, newdata = credit, type = "response")

# Classify using 0.5 as the cut-off.
credit[["yhat"]] <- ifelse(credit[["pr_hat"]] > 0.5, "Yes", "No")
table(credit[["yhat"]]) # original note: 6 people

##############################################
# 1st application : income = 15000, balance = 1000, student = Yes
# 2nd application : income = 20000, balance = 2000, student = No
# Apply the prediction directly to newly arrived applicants.
# First, a data set holding the two applicants has to be built.
# Original notes: likely to be an exam question.
# Remember how to use predict(), and be careful with factor variables.
XX <- data.frame(
  income = c(15000, 20000),
  balance = c(1000, 2000),
  student = factor(c("Yes", "No"))
)

pr_hat1 <- predict(logit4, newdata = XX, type = "response")
pr_hat1

###############################
# Start again from the raw file: reload the data, refit logit4 and classify
# with the 0.5 cut-off.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
credit <- read.csv("credit_LN4.csv", stringsAsFactors = FALSE)
credit$default <- as.factor(credit$default)

logit4 <- glm(
  default ~ balance + student + income,
  data = credit,
  family = "binomial"
)
summary(logit4)

# type = "response" returns predicted probabilities.
credit$pr_hat <- predict(logit4, newdata = credit, type = "response")
credit$yhat <- ifelse(credit$pr_hat > 0.5, "Yes", "No")
table(credit$yhat)

# Confusion matrix: predicted class (rows) against actual default (columns).
table(credit$yhat, credit$default) # original notes: 488 of 500 correct

# CCR (Correctly Classified Rate) -- flagged in the original notes as an exam
# topic; using 0.5 as the cut-off is the most important part.
# Computed from the data instead of being typed in as 488/500, so the value
# stays correct if the data or the model changes.
mean(credit$yhat == credit$default)

# Measuring model performance.
# Sensitivity: share of actual "Yes" cases predicted "Yes"
# (original notes: 5/16).
mean(credit$yhat[credit$default == "Yes"] == "Yes")

# Specificity: share of actual "No" cases predicted "No"
# (original notes: 483/484).
mean(credit$yhat[credit$default == "No"] == "No")

# ROC curve
# install.packages("pROC")
library(pROC)

# Original notes: for the exam, download the data and packages beforehand.
# roccurve holds the points needed to draw the curve.
roccurve <- roc(default ~ pr_hat, data = credit)
plot(roccurve)

# Compute the AUC.
auc(roccurve) # 0.942

# logit5: solve the same problem using only student and income.
logit5 <- glm(default ~ student + income, family = "binomial", data = credit)
credit$pr_hat5 <- predict(logit5, newdata = credit, type = "response")

roccurve2 <- roc(default ~ pr_hat5, data = credit)
plot(roccurve2)
auc(roccurve2) # 0.5517

# Comparing 0.942 with 0.5517, the logit4 model is the better one.

#######################################################
# Applying a train set and a test set: rows 1-400 train, rows 401-500 test.
train <- credit[seq_len(400), ]
test <- credit[seq(401, 500), ]

# Fit on the train set.
logit6 <- glm(
  default ~ balance + student + income,
  family = "binomial",
  data = train
)
summary(logit6)

# Apply the fitted model to the test set.
test$pr_6 <- predict(logit6, newdata = test, type = "response")

# Plot the ROC curve with out-of-sample prediction.
# Compute the AUC with out-of-sample prediction.
roccurve6 <- roc(default ~ pr_6, data = test)
plot(roccurve6)
auc(roccurve6) # 0.9439

#####################################################

저작자표시 비영리 변경금지 (새창열림)

'IT,인터넷 관련 학습 > R언어 학습' 카테고리의 다른 글

[R] 벡터 (Vector) (0)	2019.05.11
[R] R 자료형 (Data Type) (0)	2019.05.11
빅데이터 통계학 (3) (0)	2019.04.05
빅데이터 통계학(2) (0)	2019.04.05
빅데이터 통계학 (1) (0)	2019.04.05

학습러의 라이브러리

빅데이터 통계학 (4)

'IT,인터넷 관련 학습 > R언어 학습' 카테고리의 다른 글

+ Recent posts

티스토리툴바