【R言語】欠損値を数えてグラフ化
欠損値を数えてグラフ化する
データセットはKaggleのHouse Prices(Ames Housing)を使用する。
(参考)
https://www.kaggle.com/erikbruin/house-prices-lasso-xgboost-and-a-detailed-eda
# Count missing values per column of the combined train/test data and draw
# the counts as a horizontal bar chart.
#
# Run this script in a fresh R session. It deliberately does not call
# rm(list = ls()), so it never wipes objects the caller already has.

# Packages ----
library(tidyverse)
library(data.table)
library(scales)
library(VIM)
library(corrr)
library(corrplot)
library(caret)
library(gridExtra)
library(Rmisc)
library(ggrepel)
library(randomForest)
library(psych)
library(xgboost)

# Input files ----
path <- "C:/Users/XXXXXXX/"
filename <- "train.csv"
filename2 <- "test.csv"

df_train <- fread(str_c(path, filename), header = TRUE)
df_test <- fread(str_c(path, filename2), header = TRUE)

# Keep the test ids before the Id column is removed below.
test_label <- df_test$Id

# Drop Id so that train and test can be combined into one table.
df_train$Id <- NULL
df_test$Id <- NULL

all <- dplyr::bind_rows(df_train, df_test)

# Missing-value summary ----

# Example of any(): TRUE for every column that contains at least one NA.
# vapply() pins the result type, so a table with zero columns yields an
# empty logical vector instead of an empty list.
is_na <- vapply(all, function(y) any(is.na(y)), logical(1))

# Number of NA entries in each column (named by column).
na_count <- vapply(all, function(y) sum(is.na(y)), integer(1))

# One row per column: its name (`rowname`), NA count, and NA percentage
# rounded to one decimal place, sorted by descending NA count.
# The dplyr verbs are namespace-qualified because packages attached after
# dplyr (e.g. plyr, which Rmisc depends on) export functions of the same name.
Na_summary <- as.data.frame(na_count) %>%
  rownames_to_column() %>%
  dplyr::arrange(dplyr::desc(na_count)) %>%
  dplyr::mutate(na_prop = round(na_count / nrow(all) * 100, 1))

# Plot ----

# NOTE(review): `na_count > 1` also hides columns with exactly one NA;
# confirm that this is intended rather than `> 0`.
# NOTE(review): the upper y limit of 3000 is hard-coded; a column with more
# missing values than that would fall outside the scale.
Na_summary %>%
  dplyr::filter(na_count > 1) %>%
  ggplot(aes(x = reorder(factor(rowname), na_count), y = na_count)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  geom_text(aes(label = na_count)) +
  labs(x = "col name") +
  theme_bw() +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 3000)) +
  coord_flip()