통계 이야기

[5일차] Do it! 쉽게 배우는 R 데이터 분석 / 219P~ 351P / 데이터 분석 기술을 효율적으로 익히는 방법

창이 2022. 7. 3.
728x90
반응형

안녕하세요.

일을 하느라고, 주말을 이용해서 이제야 공부를 다시 해봅니다.

 

진행하는 데에 아무런 문제는 없었지만,

텍스트 마이닝 부분에서, KoNLP 패키지 관련된 이슈가 좀 힘들었습니다.

R 네임스페이스 오류로 계속 안 되더라구요. 이 이슈 때문에 지도 시각화 일부분도 조금 애를 먹었던 것 같습니다. 분명 R 4 버전 이전에는 잘 되었던 코드들인데도 불구하고, 업그레이드되면서 새롭게 찾아봐야 하는 이슈들이 있는 것 같더라구요.

 

아래 코드 블록에 정리한, Do it! 쉽게 배우는 R 데이터 분석 책에 나와 있는 코드를 활용하여 공부했습니다.

코드를 하나하나 뜯어봤고 

다른 것에 적용시키면서 오늘 하루를 보냈던 것 같습니다.

 

T검정이나 상관성 분석 등은 통계학적 이론을 먼저 알고 나니, 코드로 구현하는 것은 쉽게 할 수 있었던 것 같습니다.

 

마크다운 또한 이전에 해봤던 경력이 있어 쉽게 접근할 수 있었습니다.

# 데이터 분석하기
# 패키지 설치 및 로드
library(foreign)
library(readxl)
# dplyr (rename, %>%, filter, ...) and ggplot2 (qplot, ggplot) are used right
# below, so they must be attached before the first analysis step.
library(dplyr)
library(ggplot2)
# Read the SPSS survey file into a data frame.
# The .sav file is looked up relative to the current working directory.
raw_welfare <- read.spss(
  file = "Koweps_hpc10_2015_beta1.sav",
  to.data.frame = TRUE
)

# Work on a copy so the raw import stays available unchanged.
welfare <- raw_welfare
head(welfare)

# Give the coded survey columns readable names.
# `rename` is namespace-qualified so this step does not depend on dplyr
# having been attached earlier; the job-code column is named `code_job`
# (it was previously misspelled `code_jon`).
welfare <- dplyr::rename(
  welfare,
  sex = h10_g3,
  birth = h10_g4,
  marriage = h10_g10,
  religion = h10_g11,
  income = p1002_8aq1,
  code_job = h10_eco9,
  code_region = h10_reg7
)

class(welfare$sex)
table(welfare$sex)

# Outlier / missing-value handling for `sex`: the code 9 is turned into NA.
welfare$sex <- with(welfare, ifelse(sex == 9, NA, sex))
table(is.na(welfare$sex))

# Relabel the codes: 1 becomes "male", any other non-missing code "female".
welfare$sex <- with(welfare, ifelse(sex == 1, "male", "female"))
table(welfare$sex)
qplot(welfare$sex)

# Look at the income distribution; the plot's x-axis is limited to 0-1000.
summary(welfare$income)
qplot(welfare$income) + xlim(0, 1000)

# Income values of 0 and 9999 are replaced with NA.
welfare$income <- with(welfare, ifelse(income %in% c(0, 9999), NA, income))
table(is.na(welfare$income))

# Mean income per sex, computed only over rows with a non-missing income.
sex_income <- filter(welfare, !is.na(income)) %>%
  group_by(sex) %>%
  summarise(mean_income = mean(income))
sex_income

ggplot(sex_income, mapping = aes(x = sex, y = mean_income)) +
  geom_col()


# Inspect the birth-year column, then turn the 9999 code into NA.
summary(welfare$birth)
table(is.na(welfare$birth))
welfare$birth <- with(welfare, ifelse(birth == 9999, NA, birth))
table(is.na(welfare$birth))

# Derive age as 2015 - birth year + 1 and look at its distribution.
welfare$age <- with(welfare, 2015 - birth + 1)
summary(welfare$age)
qplot(welfare$age)

# Mean income for each age, using only rows where income is present.
age_income <- filter(welfare, !is.na(income)) %>%
  group_by(age) %>%
  summarise(mean_income = mean(income))
head(age_income)

ggplot(age_income, mapping = aes(x = age, y = mean_income)) +
  geom_line()


# Bucket ages into groups: below 30 is "young", up to 59 is "middle",
# anything above is "old".
welfare <- mutate(
  welfare,
  ageg = ifelse(age < 30, "young",
                ifelse(age <= 59, "middle", "old"))
)

table(welfare$ageg)

# Mean income per age group (rows with missing income are excluded).
ageg_income <- filter(welfare, !is.na(income)) %>%
  group_by(ageg) %>%
  summarise(mean_income = mean(income))
ageg_income

# Bar chart with the x-axis fixed to the order young, middle, old.
ggplot(ageg_income, mapping = aes(x = ageg, y = mean_income)) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old"))


# Variable review: type, summary and missing-value count of `birth`.
library(ggplot2)
class(welfare$birth)
summary(welfare$birth)
table(is.na(welfare$birth))

# Relationship between age and income: start from the birth-year column.
class(welfare$birth)
summary(welfare$birth)
qplot(welfare$birth)

# Check missing values.
table(is.na(welfare$birth))

# Turn the 9999 code into NA, then derive age as 2015 - birth year + 1.
welfare$birth <- with(welfare, ifelse(birth == 9999, NA, birth))
table(is.na(welfare$birth))
welfare$age <- with(welfare, 2015 - birth + 1)
summary(welfare$age)
qplot(welfare$age)

# Age vs. income: mean income for each age, ignoring rows without income.
library(dplyr)
age_income <- filter(welfare, !is.na(income)) %>%
  group_by(age) %>%
  summarise(mean_income = mean(income))
head(age_income)

ggplot(age_income, mapping = aes(x = age, y = mean_income)) +
  geom_line()

# Income differences between age groups.
# Below 30 is "young", up to 59 is "middle", anything above is "old".
welfare <- mutate(
  welfare,
  ageg = ifelse(age < 30, "young",
                ifelse(age <= 59, "middle", "old"))
)
table(welfare$ageg)

# Mean income per age group over rows with a non-missing income.
ageg_income <- filter(welfare, !is.na(income)) %>%
  group_by(ageg) %>%
  summarise(mean_income = mean(income))
ageg_income

# First with the default x-axis order ...
ggplot(ageg_income, mapping = aes(x = ageg, y = mean_income)) +
  geom_col()

# ... then with the x-axis fixed to young, middle, old.
ggplot(ageg_income, mapping = aes(x = ageg, y = mean_income)) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old"))

# Income differences by age group and sex.
sex_income <- filter(welfare, !is.na(income)) %>%
  group_by(ageg, sex) %>%
  summarise(mean_income = mean(income))
sex_income

# Bars coloured by sex, using geom_col()'s default position.
ggplot(sex_income, mapping = aes(x = ageg, y = mean_income, fill = sex)) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old"))

# Same data with the bars for each sex placed side by side.
ggplot(sex_income, mapping = aes(x = ageg, y = mean_income, fill = sex)) +
  geom_col(position = "dodge") +
  scale_x_discrete(limits = c("young", "middle", "old"))

# Mean income table by age and sex.
# The summary column must be named `mean_income`, because the plot below maps
# y to that name (it was previously misspelled `meam_income`).
sex_age <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age, sex) %>%
  summarise(mean_income = mean(income))
head(sex_age)

# One line per sex: mean income across ages.
ggplot(data = sex_age, aes(x = age, y = mean_income, col = sex)) +
  geom_line()

# Divorce rate by religion.
class(welfare$religion)

# Recode religion: 1 becomes "yes", any other non-missing code "no".
welfare$religion <- with(welfare, ifelse(religion == 1, "yes", "no"))
qplot(welfare$religion)

# Marital status: code 1 becomes "marriage", code 3 "divorce", the rest NA.
table(welfare$marriage)
welfare$group_marriage <- with(
  welfare,
  ifelse(marriage == 1, "marriage",
         ifelse(marriage == 3, "divorce", NA))
)
table(welfare$group_marriage)
table(is.na(welfare$group_marriage))

# Count rows per religion x marital status, then express each count as a
# percentage of its group total (rounded to 1 decimal).
religion_marriage <- filter(welfare, !is.na(group_marriage)) %>%
  group_by(religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )

# Keep only the divorce rows.
divorce <- filter(religion_marriage, group_marriage == "divorce") %>%
  select(religion, pct)
divorce

ggplot(divorce, mapping = aes(x = religion, y = pct)) +
  geom_col()

# Marital-status percentages within each age group.
ageg_marriage <- filter(welfare, !is.na(group_marriage)) %>%
  group_by(ageg, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )
ageg_marriage

# Divorce percentage per age group, leaving out the "young" group.
ageg_divorce <- ageg_marriage %>%
  filter(ageg != "young", group_marriage == "divorce") %>%
  select(ageg, pct)

# Same breakdown split further by religion ("young" excluded again).
ageg_religion_marriage <- welfare %>%
  filter(!is.na(group_marriage), ageg != "young") %>%
  group_by(ageg, religion, group_marriage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  )

# Divorce rows only, by age group and religion.
df_divorce <- filter(ageg_religion_marriage, group_marriage == "divorce") %>%
  select(ageg, religion, pct)
df_divorce

# Regions with a large share of elderly people.
class(welfare$code_region)

# Lookup table: region code (1-7) -> region name.
# The name column must be called `region`, because the steps below select and
# group by `region` (it was previously misspelled `regrion`).
list_region <- data.frame(
  code_region = 1:7,
  region = c(
    "서울",
    "수도권(인천/경기)",
    "부산/경남/울산",
    "대구/경북",
    "대전/충남",
    "강원/충북",
    "광주/전남/전북/제주도"
  )
)

# Attach the region name to every row. The join key is passed through `by`;
# left_join() has no `id` argument.
welfare <- left_join(welfare, list_region, by = "code_region")
welfare %>%
  select(code_region, region) %>%
  head()

# Share (%) of each age group within every region, rounded to 2 decimals.
region_ageg <- welfare %>%
  group_by(region, ageg) %>%
  summarise(n = n()) %>%
  mutate(tot_group = sum(n)) %>%
  mutate(pct = round(n / tot_group * 100, 2))
region_ageg

# Text mining: install KoNLP and the packages used to set it up.
install.packages("rJava")
install.packages("memoise")
# NOTE(review): KoNLP may not be available from the default CRAN repository,
# in which case this call only warns and the GitHub install below is the one
# that actually provides the package - confirm.
install.packages("KoNLP")
install.packages("multilinguer")
# remotes::install_github() is called below, so remotes has to be installed.
install.packages("remotes")

# Install a JDK through multilinguer.
library(multilinguer)
multilinguer::install_jdk()

# Install KoNLP from GitHub without upgrading already-installed dependencies.
remotes::install_github(
  "haven-jeon/KoNLP",
  upgrade = "never",
  INSTALL_opts = c("--no-multiarch")
)

# Map visualisation: choropleth of USArrests by state.
install.packages("ggiraphExtra")
library(ggiraphExtra)
head(USArrests)

# Move the state names from row names into a lower-cased `state` column
# (presumably so they match the region names in the map data - confirm).
library(tibble)
crime <- rownames_to_column(USArrests, var = "state")
crime$state <- tolower(crime$state)

# map_data() reads its polygons from the maps package, so make sure it is
# installed before calling it.
# NOTE(review): ggChoropleth may additionally need the mapproj package for its
# map projection - confirm and install it if the call below complains.
library(ggplot2)
if (!requireNamespace("maps", quietly = TRUE)) {
  install.packages("maps")
}
states_map <- map_data("state")

# Static choropleth of the murder rate, then the interactive version.
ggChoropleth(
  data = crime,
  aes(fill = Murder, map_id = state),
  map = states_map
)
ggChoropleth(
  data = crime,
  aes(fill = Murder, map_id = state),
  map = states_map,
  interactive = TRUE
)
# Install helper packages and kormaps2014 (from GitHub, via devtools).
# NOTE(review): nothing later in this script loads kormaps2014 - confirm it is
# still needed here.
install.packages("stringi")
install.packages("devtools")
devtools::install_github("cardiomoon/kormaps2014")
# NOTE(review): caesar is installed from a dated snapshot repository and then
# removed again two lines below, so the net effect on caesar is nothing; this
# looks like troubleshooting residue - confirm before keeping, and verify that
# the snapshot URL is still reachable.
install.packages("caesar", repos = "https://cran.microsoft.com/snapshot/2021-07-16/")
devtools::install_github("klutometis/roxygen")
remove.packages("caesar")

# Interactive graph: build a ggplot scatter plot, then pass it to ggplotly().
install.packages("plotly")
library(plotly)

p <- ggplot(mpg, mapping = aes(x = displ, y = hwy, col = drv)) +
  geom_point()
ggplotly(p)

# dygraphs package: time-series charts built from xts objects.
install.packages("dygraphs")
library(dygraphs)
library(xts)

economics <- ggplot2::economics
head(economics)

# Unemployment series ordered by date.
eco <- xts(economics$unemploy, order.by = economics$date)
dygraph(eco)

# Same chart with a range selector added.
dyRangeSelector(dygraph(eco))

# Savings rate next to unemployment (unemploy is divided by 1000).
eco_a <- xts(economics$psavert, order.by = economics$date)
eco_b <- xts(economics$unemploy / 1000, order.by = economics$date)
eco2 <- cbind(eco_a, eco_b)
colnames(eco2) <- c("psavert", "unemploy")
dyRangeSelector(dygraph(eco2))

# t-test: compare `cty` between the "compact" and "suv" classes of mpg.
mpg <- as.data.frame(ggplot2::mpg)

# Keep only the two columns and the two classes being compared.
mpg_diff <- mpg %>%
  select(class, cty) %>%
  filter(class %in% c("compact", "suv"))
# The stray `3` after this call was a syntax error that stopped the whole
# script from parsing.
head(mpg_diff)

# Two-sample t-test of cty by class, assuming equal variances.
t.test(cty ~ class, data = mpg_diff, var.equal = TRUE)

# Correlation analysis.
# Test the correlation between unemploy and pce in the economics data.
economics <- as.data.frame(ggplot2::economics)
cor.test(economics$unemploy, economics$pce)

# Correlation matrix of mtcars, printed to 2 decimals and drawn with corrplot.
install.packages("corrplot")
library(corrplot)
head(mtcars)
car_cor <- cor(mtcars)
round(car_cor, digits = 2)
corrplot(car_cor)


# Base R indexing on a data frame read from csv_exam.csv.
exam <- read.csv(file = "csv_exam.csv")

# Empty brackets return the whole data frame.
exam[]

# Row indexing: first row, then second row.
exam[1, ]
exam[2, ]

# Rows where class is 1.
exam[exam$class == 1, ]

# Rows where class is 1 and math is at least 50.
exam[exam$class == 1 & exam$math >= 50, ]

# Column indexing by position ...
exam[, 1]

# ... and by name.
exam[, "class"]

R 마크다운입니다.

728x90
반응형

댓글

추천 글