통계 이야기
[5일차] Do it! 쉽게 배우는 R 데이터 분석 / 219P~ 351P / 데이터 분석 기술을 효율적으로 익히는 방법
728x90
반응형
안녕하세요.
일을 하느라고, 주말을 이용해서 이제야 공부를 다시 해봅니다.
진행하는 데에 아무런 문제는 없었지만,
텍스트 마이닝 부분에서, KoNLP 패키지 관련된 이슈가 좀 힘들었습니다.
R 네임스페이스 오류로 계속 안 되더라고요. 이 이슈 때문에 지도 시각화 일부분도 조금 애를 먹었던 것 같습니다. 분명 R 4 버전 이전에는 잘 되었던 코드들인데도, 업그레이드되면서 새롭게 찾아봐야 하는 이슈들이 있는 것 같더라고요.
아래 코드 블록처럼, 『Do it! 쉽게 배우는 R 데이터 분석』 책에 나와 있는 코드를 활용하여 공부했습니다.
코드를 하나하나 뜯어봤고
다른 것에 적용시키면서 오늘 하루를 보냈던 것 같습니다.
T검정이나 상관성 분석 등은 통계학적 이론을 먼저 알고 나니, 코드로 쉽게 구현할 수 있었던 것 같습니다.
마크다운 또한 이전에 해본 경험이 있어 쉽게 접근할 수 있었습니다.
# Data analysis ----
# Load packages. dplyr and ggplot2 are attached here because rename(), %>%
# and qplot() are used further down, before the later library() calls.
library(foreign)  # read.spss()
library(readxl)
library(dplyr)    # rename(), %>%, filter(), group_by(), summarise()
library(ggplot2)  # qplot(), ggplot()

# Read the SPSS file as a data frame and work on a copy of the raw data.
raw_welfare <- read.spss(
  file = "Koweps_hpc10_2015_beta1.sav",
  to.data.frame = TRUE
)
welfare <- raw_welfare
head(welfare)

# Give the variables used in the analysis readable names
# (new_name = old_name).
welfare <- rename(
  welfare,
  sex = h10_g3,
  birth = h10_g4,
  marriage = h10_g10,
  religion = h10_g11,
  income = p1002_8aq1,
  code_job = h10_eco9,
  code_region = h10_reg7
)
class(welfare$sex)
table(welfare$sex)
# Outlier and missing-value handling ----
# Sex: the value 9 is recoded to NA, then 1 becomes "male" and any other
# value "female".
welfare$sex <- with(welfare, ifelse(sex == 9, NA, sex))
table(is.na(welfare$sex))
welfare$sex <- with(welfare, ifelse(sex == 1, "male", "female"))
table(welfare$sex)
qplot(welfare$sex)

# Income: look at the distribution, then recode 0 and 9999 to NA.
summary(welfare$income)
qplot(welfare$income) + xlim(0, 1000)
welfare$income <- with(
  welfare,
  ifelse(income %in% c(0, 9999), NA, income)
)
table(is.na(welfare$income))

# Mean income by sex, computed on rows with a non-missing income.
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(sex) %>%
  summarise(mean_income = mean(income))
sex_income
ggplot(sex_income, aes(sex, mean_income)) +
  geom_col()
# Birth year: inspect, recode 9999 to NA, and count missing values
# before and after the recode.
summary(welfare$birth)
table(is.na(welfare$birth))
welfare$birth <- with(welfare, ifelse(birth == 9999, NA, birth))
table(is.na(welfare$birth))

# Age is derived from the birth year relative to 2015, plus one.
welfare$age <- with(welfare, 2015 - birth + 1)
summary(welfare$age)
qplot(welfare$age)

# Mean income for each age, on rows with a non-missing income.
age_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age) %>%
  summarise(mean_income = mean(income))
head(age_income)
ggplot(age_income, aes(age, mean_income)) +
  geom_line()

# Age group: under 30 is "young", up to 59 is "middle", otherwise "old".
welfare <- welfare %>%
  mutate(
    ageg = ifelse(
      age < 30,
      "young",
      ifelse(age <= 59, "middle", "old")
    )
  )
table(welfare$ageg)

# Mean income for each age group, bars ordered young -> middle -> old.
ageg_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg) %>%
  summarise(mean_income = mean(income))
ageg_income
ggplot(ageg_income, aes(ageg, mean_income)) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old"))
# Inspect the birth-year variable ----
class(welfare$birth)
summary(welfare$birth)
table(is.na(welfare$birth))

# Relationship between age and monthly income ----
class(welfare$birth)
summary(welfare$birth)
library(ggplot2)
qplot(welfare$birth)

# Check missing values: count NAs, recode 9999 to NA, count again.
table(is.na(welfare$birth))
welfare$birth <- with(welfare, ifelse(birth == 9999, NA, birth))
table(is.na(welfare$birth))

# Age is derived from the birth year relative to 2015, plus one.
welfare$age <- with(welfare, 2015 - birth + 1)
summary(welfare$age)
qplot(welfare$age)

# Analyse the relationship between age and monthly income ----
library(dplyr)
age_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age) %>%
  summarise(mean_income = mean(income))
head(age_income)
ggplot(age_income, aes(age, mean_income)) +
  geom_line()
# Income difference by age group ----
# Under 30 is "young", up to 59 is "middle", everything else "old".
welfare <- welfare %>%
  mutate(
    ageg = ifelse(
      age < 30,
      "young",
      ifelse(age <= 59, "middle", "old")
    )
  )
table(welfare$ageg)

# Mean income per age group, on rows with a non-missing income.
ageg_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg) %>%
  summarise(mean_income = mean(income))
ageg_income

# First with the default bar order, then ordered young -> middle -> old.
ggplot(ageg_income, aes(ageg, mean_income)) +
  geom_col()
ggplot(ageg_income, aes(ageg, mean_income)) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old"))
# Income difference by age group and sex ----
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg, sex) %>%
  summarise(mean_income = mean(income))
sex_income

# Stacked bars first, then side-by-side ("dodge") bars; both use the
# young -> middle -> old ordering on the x axis.
ggplot(sex_income, aes(ageg, mean_income, fill = sex)) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old"))
ggplot(sex_income, aes(ageg, mean_income, fill = sex)) +
  geom_col(position = "dodge") +
  scale_x_discrete(limits = c("young", "middle", "old"))
# Mean income table by age and sex ----
# The summary column must be named mean_income because the plot below maps
# y to mean_income.
sex_age <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age, sex) %>%
  summarise(mean_income = mean(income))
head(sex_age)

# One line per sex, mean income against age.
ggplot(data = sex_age, aes(x = age, y = mean_income, col = sex)) +
  geom_line()
# Divorce rate by religion ----
class(welfare$religion)
# Recode religion: 1 becomes "yes", any other value "no".
welfare$religion <- with(welfare, ifelse(religion == 1, "yes", "no"))
qplot(welfare$religion)

table(welfare$marriage)
# Keep two marital states only: 1 -> "marriage", 3 -> "divorce", rest -> NA.
welfare$group_marriage <- with(
  welfare,
  ifelse(marriage == 1, "marriage", ifelse(marriage == 3, "divorce", NA))
)
table(welfare$group_marriage)
table(is.na(welfare$group_marriage))

# Count per (religion, marital state); pct is the share within the group
# left after summarise(), rounded to one decimal.
religion_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )

# Extract the divorce rows.
divorce <- religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(religion, pct)
divorce
ggplot(divorce, aes(religion, pct)) +
  geom_col()

# Same percentage table, by age group instead of religion.
ageg_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(ageg, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )
ageg_marriage

# Divorce rows for every age group except "young".
ageg_divorce <- ageg_marriage %>%
  filter(ageg != "young", group_marriage == "divorce") %>%
  select(ageg, pct)

# Percentage table by age group, religion and marital state, excluding the
# "young" group.
ageg_religion_marriage <- welfare %>%
  filter(!is.na(group_marriage), ageg != "young") %>%
  group_by(ageg, religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )

df_divorce <- ageg_religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(ageg, religion, pct)
df_divorce
# Regions with a large share of the old age group ----
class(welfare$code_region)

# Lookup table mapping region codes 1-7 to region names. The name column
# must be called `region` because it is selected and grouped on below.
list_region <- data.frame(
  code_region = 1:7,
  region = c(
    "서울", "수도권(인천/경기)", "부산/경남/울산", "대구/경북",
    "대전/충남", "강원/충북", "광주/전남/전북/제주도"
  )
)

# Attach the region name to every row; the join key is passed through `by`.
welfare <- left_join(welfare, list_region, by = "code_region")
welfare %>%
  select(code_region, region) %>%
  head()

# Count per (region, age group); pct is the share within each region,
# rounded to two decimals.
region_ageg <- welfare %>%
  group_by(region, ageg) %>%
  summarise(n = n()) %>%
  mutate(tot_group = sum(n)) %>%
  mutate(pct = round(n / tot_group * 100, 2))
region_ageg
# Text mining: install KoNLP and its prerequisites ----
# A JDK is installed first through multilinguer, before the Java-based
# packages.
install.packages("multilinguer")
library(multilinguer)
multilinguer::install_jdk()

# remotes is installed here because remotes::install_github() is called
# below and nothing else in this script installs it.
install.packages(c("rJava", "memoise", "remotes"))

# KoNLP is installed from GitHub only.
# NOTE(review): the earlier install.packages("KoNLP") call was dropped on the
# assumption that KoNLP is no longer distributed on CRAN — confirm.
remotes::install_github(
  "haven-jeon/KoNLP",
  upgrade = "never",
  INSTALL_opts = c("--no-multiarch")
)
# Maps ----
install.packages("ggiraphExtra")
library(ggiraphExtra)
head(USArrests)

# Move the row names of USArrests into a lower-cased `state` column, which
# is used as map_id below.
library(tibble)
crime <- rownames_to_column(USArrests, var = "state")
crime[["state"]] <- tolower(crime[["state"]])

library(ggplot2)
states_map <- map_data("state")

# Choropleth filled by Murder: static first, then interactive.
ggChoropleth(
  data = crime,
  aes(fill = Murder, map_id = state),
  map = states_map
)
ggChoropleth(
  data = crime,
  aes(fill = Murder, map_id = state),
  map = states_map,
  interactive = TRUE
)
# Install kormaps2014 from GitHub, after installing stringi and devtools
# (devtools provides install_github()).
install.packages("stringi")
install.packages("devtools")
devtools::install_github("cardiomoon/kormaps2014")
# NOTE(review): the next three lines install "caesar" from a dated CRAN
# snapshot, install roxygen from GitHub, then remove "caesar" again. The
# purpose is not evident from this script — presumably a dependency
# workaround; confirm whether these lines are still needed.
install.packages("caesar", repos = "https://cran.microsoft.com/snapshot/2021-07-16/")
devtools::install_github("klutometis/roxygen")
remove.packages("caesar")
# Interactive graphs ----
install.packages("plotly")
library(plotly)

# Build a static scatter plot of hwy against displ, coloured by drv, and
# hand it to ggplotly() to make it interactive.
p <- ggplot(mpg, aes(displ, hwy, col = drv)) +
  geom_point()
ggplotly(p)
# dygraphs package ----
install.packages("dygraphs")
library(dygraphs)
economics <- ggplot2::economics
head(economics)

# dygraph() is fed an xts series ordered by the date column.
library(xts)
eco <- xts(economics$unemploy, order.by = economics$date)
dygraph(eco)
dyRangeSelector(dygraph(eco))

# Savings rate (psavert) together with unemploy divided by 1000, combined
# into one two-column series.
eco_a <- xts(economics$psavert, order.by = economics$date)
eco_b <- xts(economics$unemploy / 1000, order.by = economics$date)
eco2 <- cbind(eco_a, eco_b)
colnames(eco2) <- c("psavert", "unemploy")
dyRangeSelector(dygraph(eco2))
# t-test ----
# Compare `cty` between the "compact" and "suv" classes of ggplot2's mpg data.
mpg <- as.data.frame(ggplot2::mpg)
mpg_diff <- mpg %>%
  select(class, cty) %>%
  filter(class %in% c("compact", "suv"))
head(mpg_diff)

# Two-sample t-test with var.equal = TRUE (equal variances assumed).
t.test(cty ~ class, data = mpg_diff, var.equal = TRUE)
# Correlation analysis ----
# Correlation test between unemploy and pce in ggplot2's economics data.
economics <- as.data.frame(ggplot2::economics)
cor.test(economics$unemploy, economics$pce)

# Correlation matrix of mtcars: printed rounded to two decimals, then
# drawn with corrplot().
install.packages("corrplot")
library(corrplot)
head(mtcars)
car_cor <- cor(mtcars)
round(car_cor, digits = 2)
corrplot(corr = car_cor)
# Using base R indexing ----
exam <- read.csv("csv_exam.csv")

# Whole data frame, then single rows by position.
exam[]
exam[1, ]
exam[2, ]

# Rows selected by a logical condition on the columns.
exam[exam$class == 1, ]
exam[with(exam, class == 1 & math >= 50), ]

# A single column, by position and by name.
exam[, 1]
exam[, "class"]
R 마크다운입니다.
728x90
반응형
댓글