2/13/2022

Cleaning MyAnimeList data for Google Data Analytics Capstone

In the previous script I had downloaded all of the data that I wanted to use for my capstone project. Now I need to explore and clean all of it.

First things first - setting some variables

Set your working directory to wherever you’d like in the WORKINDIRECTORY section.

wd1 = "WORKINGDIRECTORY"
setwd(wd1)

Install and load any libraries

The tidyverse will be used to manipulate/transform data and the janitor library will be used to check duplicates and consistency of data.

install.packages(c("tidyverse", "janitor"))

Next, we want to load the two libraries:

library(tidyverse)
library(janitor)

Load in the data

Previously, we had downloaded and made transformations to data from the MyAnimeList API. Now we’re going to load that data into R for further analysis and cleaning.

anime_demo_table <- read.delim("temp_anime_demo_table.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
anime_genres_table <- read.delim("temp_anime_genres_table.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
anime_ranking_table <- read.delim("temp_anime_ranking_table.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
anime_studios_table <- read.delim("temp_anime_studios_table.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
anime_syn_table <- read.delim("temp_anime_syn_table.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
anime_table <- read.delim("temp_anime_table.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
demo_l <- read.delim("temp_demo_l.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
genres_l <- read.delim("temp_genres_l.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
rank_table <- read.delim("temp_rank_table.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
studios_l <- read.delim("temp_studios_l.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
tm_ky <- read.delim("temp_tm_ky.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)

Clean the data

I break cleaning and data integrity into the following steps:

Adjust any field types (e.g. character columns that need to be numeric)
Cleaning up column names
Adjusting strings to eliminate trailing, leading, and unnecessary spaces as well as replacing any double quotes with single quotes
Check for nulls
Check for duplicates
Check for inconsistencies in naming conventions (within reason).

Adjust field types

glimpse(anime_demo_table)

## Rows: 240
## Columns: 3
## $ tm_ky   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,~
## $ id      <int> 10030, 10049, 10162, 10165, 10271, 10379, 1065, 10800, 11061, ~
## $ demo_id <int> 27, 27, 43, 27, 42, 25, 27, 43, 27, 42, 43, 25, 27, 27, 42, 27~

glimpse(anime_genres_table)

## Rows: 1,517
## Columns: 3
## $ tm_ky     <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
## $ id        <int> 1, 1, 1, 1, 1, 1, 10030, 10030, 10030, 10049, 10049, 10049, ~
## $ genres_id <int> 1, 2, 24, 29, 4, 8, 22, 4, 8, 1, 37, 6, 1, 10, 37, 36, 23, 3~

glimpse(anime_ranking_table)

## Rows: 393
## Columns: 12
## $ tm_ky                        <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,~
## $ id                           <int> 5114, 48583, 38524, 9253, 28977, 42938, 9~
## $ mean                         <dbl> 9.15, 9.15, 9.09, 9.09, 9.09, 9.06, 9.06,~
## $ rank                         <int> 1, 2, 3, 4, 5, 6, 7, 9, 10, 12, 14, 16, 1~
## $ popularity                   <int> 3, 229, 32, 13, 336, 623, 381, 9, 684, 73~
## $ num_scoring_users            <int> 1690301, 144781, 1182762, 1152310, 178769~
## $ statistics.watching          <int> 212433, 381690, 66687, 139749, 56757, 318~
## $ statistics.completed         <int> 2017696, 138, 1519325, 1359955, 210439, 1~
## $ statistics.on_hold           <int> 94782, 3199, 7346, 75881, 20221, 4150, 12~
## $ statistics.dropped           <int> 42850, 1303, 4657, 44659, 14120, 2613, 84~
## $ statistics.plan_to_watch     <int> 413589, 266683, 119020, 541799, 207021, 1~
## $ statistics.num_scoring_users <int> 2781350, 653013, 1717035, 2162043, 508558~

glimpse(anime_studios_table)

## Rows: 412
## Columns: 3
## $ tm_ky     <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
## $ id        <int> 1, 10030, 10049, 10087, 10162, 10165, 10271, 10379, 1065, 10~
## $ studio_id <int> 14, 7, 37, 43, 10, 2, 11, 112, 280, 36, 86, 112, 11, 103, 11~

glimpse(anime_syn_table)

## Rows: 555
## Columns: 3
## $ tm_ky    <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2~
## $ id       <int> 5114, 5114, 5114, 5114, 48583, 48583, 28977, 42938, 42938, 42~
## $ synonyms <chr> "Hagane no Renkinjutsushi: Fullmetal Alchemist", "Fullmetal A~

glimpse(anime_table)

## Rows: 393
## Columns: 22
## $ tm_ky                 <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
## $ id                    <int> 5114, 48583, 38524, 9253, 28977, 42938, 9969, 11~
## $ title                 <chr> "Fullmetal Alchemist: Brotherhood", "Shingeki no~
## $ main_picture.medium   <chr> "https://api-cdn.myanimelist.net/images/anime/12~
## $ main_picture.large    <chr> "https://api-cdn.myanimelist.net/images/anime/12~
## $ alternative_titles.en <chr> "Fullmetal Alchemist: Brotherhood", "Attack on T~
## $ alternative_titles.ja <chr> "<U+92FC><U+306E><U+932C><U+91D1><U+8853><U+5E2B~
## $ start_date            <chr> "2009-04-05", "2022-01-10", "2019-04-29", "2011-~
## $ end_date              <chr> "2010-07-04", NA, "2019-07-01", "2011-09-14", "2~
## $ synopsis              <chr> "After a horrific alchemy experiment goes wrong ~
## $ media_type            <chr> "tv", "tv", "tv", "tv", "tv", "tv", "tv", "tv", ~
## $ status                <chr> "finished_airing", "currently_airing", "finished~
## $ num_episodes          <int> 64, 0, 10, 24, 51, 13, 51, 148, 13, 12, 22, 24, ~
## $ start_season.year     <int> 2009, 2022, 2019, 2011, 2015, 2021, 2011, 2011, ~
## $ start_season.season   <chr> "spring", "winter", "spring", "spring", "spring"~
## $ rating                <chr> "r", "r", "r", "pg_13", "pg_13", "pg_13", "pg_13~
## $ nsfw                  <chr> "white", "white", "white", "white", "white", "wh~
## $ demo_de               <chr> "Shounen", "Shounen", "Shounen", NA, "Shounen", ~
## $ genres_de             <chr> "Action,Fantasy,Adventure,Military,Comedy,Drama"~
## $ studios_de            <chr> "Bones", "MAPPA", "Wit Studio", "White Fox", "Ba~
## $ synonyms              <chr> "Hagane no Renkinjutsushi: Fullmetal Alchemist,F~
## $ alternative_title     <chr> "Fullmetal Alchemist: Brotherhood", "Attack on T~

glimpse(demo_l)

## Rows: 5
## Columns: 3
## $ tm_ky   <int> 2, 2, 2, 2, 2
## $ demo_id <int> 15, 25, 27, 42, 43
## $ demo_de <chr> "Kids", "Shoujo", "Shounen", "Seinen", "Josei"

glimpse(genres_l)

## Rows: 35
## Columns: 3
## $ tm_ky     <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
## $ genres_id <int> 1, 10, 11, 13, 14, 17, 18, 19, 2, 20, 21, 22, 23, 24, 26, 28~
## $ genres_de <chr> "Action", "Fantasy", "Game", "Historical", "Horror", "Martia~

glimpse(rank_table)

## Rows: 78,336
## Columns: 5
## $ tm_ky         <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,~
## $ id            <int> 5114, 48583, 38524, 9253, 28977, 42938, 9969, 39486, 110~
## $ title         <chr> "Fullmetal Alchemist: Brotherhood", "Shingeki no Kyojin:~
## $ rank          <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1~
## $ rank_category <chr> "all", "all", "all", "all", "all", "all", "all", "all", ~

glimpse(studios_l)

## Rows: 82
## Columns: 3
## $ tm_ky     <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
## $ studio_id <int> 1, 10, 101, 103, 1075, 11, 110, 1103, 1109, 1119, 112, 1129,~
## $ studio_de <chr> "Studio Pierrot", "Production I.G", "Studio Hibari", "Tatsun~

glimpse(tm_ky)

## Rows: 1
## Columns: 2
## $ tm_ky    <int> 2
## $ tm_ky_de <chr> "2/11/22"

Even though the data downloaded had been primarily character based, when the files are read into R, the correct field types are used.

It looks like no changes will be needed at this point.

Cleaning up column names

I want to change all of the id fields to mal_id.

names(anime_demo_table)[names(anime_demo_table) == "id"] = "mal_id"
names(anime_genres_table)[names(anime_genres_table) == "id"] = "mal_id"
names(anime_ranking_table)[names(anime_ranking_table) == "id"] = "mal_id"
names(anime_studios_table)[names(anime_studios_table) == "id"] = "mal_id"
names(anime_syn_table)[names(anime_syn_table) == "id"] = "mal_id"
names(anime_table)[names(anime_table) == "id"] = "mal_id"
names(rank_table)[names(rank_table) == "id"] = "mal_id"

These are the column names that I’m expecting in each table:

anime_demo_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
demo_id	int

anime_genres_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
genres_id	int	PK

anime_ranking_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
mean	dbl
rank	int
popularity	int
num_scoring_users	int
statistics.watching	int
statistics.completed	int
statistics.on_hold	int
statistics.dropped	int
statistics.plan_to_watch	int
statistics.num_scoring_users	int

anime_studios_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
studio_id	int	PK

anime_syn_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
synonyms	chr

anime_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
title	chr
main_picture.medium	chr
main_picture.large	chr
alternative_titles.en	chr
alternative_titles.ja	chr
start_date	chr
end_date	chr
synopsis	chr
media_type	chr
status	chr
num_episodes	int
start_season.year	int
start_season.season	chr
rating	chr
nsfw	chr
demo_de	chr
genres_de	chr
studios_de	chr
synonyms	chr
alternative_title	chr

rank_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
title	chr
rank	int
rank_category	chr	PK

demo_l

Field	Type	Primary Key
tm_ky	int	PK
demo_id	int	PK
demo_de	chr

genres_l

Field	Type	Primary Key
tm_ky	int	PK
genres_id	int	PK
genres_de	chr

studios_l

Field	Type	Primary Key
tm_ky	int	PK
studio_id	int	PK
studio_de	chr

tm_ky

Field	Type	Primary Key
tm_ky	int	PK
tm_ky_de	chr

anime_demo_table_Col <- c('tm_ky','mal_id','demo_id')
anime_genres_table_Col <- c('tm_ky','mal_id','genres_id')
anime_ranking_table_Col <- c('tm_ky','mal_id','mean','rank','popularity','num_scoring_users','statistics.watching','statistics.completed','statistics.on_hold','statistics.dropped','statistics.plan_to_watch','statistics.num_scoring_users')
anime_studios_table_Col <- c('tm_ky','mal_id','studio_id')
anime_syn_table_Col <- c('tm_ky','mal_id','synonyms')
anime_table_Col <- c('tm_ky','mal_id','title','main_picture.medium','main_picture.large','alternative_titles.en','alternative_titles.ja','start_date','end_date','synopsis','media_type','status','num_episodes','start_season.year','start_season.season','rating','nsfw','demo_de','genres_de','studios_de','synonyms','alternative_title')
rank_table_Col <- c('tm_ky','mal_id','title','rank','rank_category') 
demo_l_Col <- c('tm_ky','demo_id','demo_de')
genres_l_col <- c('tm_ky','genres_id','genres_de')
studios_l_Col <- c('tm_ky','studio_id','studio_de')
tm_ky_Col <- c('tm_ky','tm_ky_de')

identical(colnames(anime_demo_table), anime_demo_table_Col)

## [1] TRUE

identical(colnames(anime_genres_table), anime_genres_table_Col)

## [1] TRUE

identical(colnames(anime_ranking_table), anime_ranking_table_Col)

## [1] TRUE

identical(colnames(anime_studios_table), anime_studios_table_Col)

## [1] TRUE

identical(colnames(anime_syn_table), anime_syn_table_Col)

## [1] TRUE

identical(colnames(anime_table), anime_table_Col)

## [1] TRUE

identical(colnames(rank_table), rank_table_Col)

## [1] TRUE

identical(colnames(demo_l), demo_l_Col)

## [1] TRUE

identical(colnames(genres_l), genres_l_col)

## [1] TRUE

identical(colnames(studios_l), studios_l_Col)

## [1] TRUE

identical(colnames(tm_ky), tm_ky_Col)

## [1] TRUE

All of the column names match expectations, so we are good!

Cleaning up strings

There is one situation that will cause issues when uploading onto tableau. If the field includes characters for new line (\n) or carriage returns (\r) the table loading process will fail.

I need to search through all tables that contain character fields and confirm that they don’t have those characters.

tables1 <- list(anime_demo_table=anime_demo_table, anime_genres_table=anime_genres_table, anime_ranking_table=anime_ranking_table, anime_studios_table=anime_studios_table, anime_syn_table=anime_syn_table, anime_table=anime_table, rank_table=rank_table, demo_l=demo_l, genres_l=genres_l, studios_l=studios_l)

find_character <- function(df, dfName){
  for (i in 1:ncol(df)){
    if (is.character(df[,i])){
      if (TRUE %in% grep("\n",df[, i]) | TRUE %in% grep("\r",df[, i])){
        if (TRUE %in% grep("\n",df[, i])){
          print(paste(dfName, "-", names(df)[i], "column has the new line character"))
        }
        if (TRUE %in% grep("\r",df[, i])){
          print(paste(dfName, "-", names(df)[i], "column has the carriage return character"))
        }
      }
    }
  }
}

for (i in 1:length(tables1)){
  find_character(tables1[[i]], names(tables1)[i])
}

## [1] "anime_table - synopsis column has the new line character"

tables1 <- NULL

It appears that only the synopsis column contains the new line character, so lets get rid of those.

anime_table$synopsis <- sapply(anime_table$synopsis, 
                             function(x) {gsub("[\n]","", x) })

Next, there are four character strings that I need to adjust for:

Eliminate leading spaces
Eliminate trailing spaces
Eliminate unnecessary spaces
Eliminate any double quotes and replace with single quotes

The following tables have character columns:

anime_syn_table
anime_table
rank_table
demo_l
genres_l
studios_l

strFunctions <- function(df){
  df <- df %>%
    mutate(across(where(is.character), str_trim)) %>%
    mutate(across(where(is.character), str_squish)) %>%
    mutate(across(where(is.character), gsub, pattern='"', replacement="'"))
  return(df)
}

anime_syn_table <- strFunctions(anime_syn_table)
anime_table <- strFunctions(anime_table)
rank_table <- strFunctions(rank_table)
demo_l <- strFunctions(demo_l)
genres_l <- strFunctions(genres_l)
studios_l <- strFunctions(studios_l)

Check for nulls

I will check for nulls in each table and assess whether or not I need to address.

lapply(anime_demo_table,function(x) { length(which(is.na(x)))})

## $tm_ky
## [1] 0
## 
## $mal_id
## [1] 0
## 
## $demo_id
## [1] 0

lapply(anime_genres_table,function(x) { length(which(is.na(x)))})

## $tm_ky
## [1] 0
## 
## $mal_id
## [1] 0
## 
## $genres_id
## [1] 0

lapply(anime_ranking_table,function(x) { length(which(is.na(x)))})

## $tm_ky
## [1] 0
## 
## $mal_id
## [1] 0
## 
## $mean
## [1] 0
## 
## $rank
## [1] 0
## 
## $popularity
## [1] 0
## 
## $num_scoring_users
## [1] 0
## 
## $statistics.watching
## [1] 0
## 
## $statistics.completed
## [1] 0
## 
## $statistics.on_hold
## [1] 0
## 
## $statistics.dropped
## [1] 0
## 
## $statistics.plan_to_watch
## [1] 0
## 
## $statistics.num_scoring_users
## [1] 0

lapply(anime_studios_table,function(x) { length(which(is.na(x)))})

## $tm_ky
## [1] 0
## 
## $mal_id
## [1] 0
## 
## $studio_id
## [1] 0

lapply(anime_syn_table,function(x) { length(which(is.na(x)))})

## $tm_ky
## [1] 0
## 
## $mal_id
## [1] 0
## 
## $synonyms
## [1] 0

lapply(anime_table,function(x) { length(which(is.na(x)))})

## $tm_ky
## [1] 0
## 
## $mal_id
## [1] 0
## 
## $title
## [1] 0
## 
## $main_picture.medium
## [1] 0
## 
## $main_picture.large
## [1] 0
## 
## $alternative_titles.en
## [1] 0
## 
## $alternative_titles.ja
## [1] 0
## 
## $start_date
## [1] 0
## 
## $end_date
## [1] 8
## 
## $synopsis
## [1] 0
## 
## $media_type
## [1] 0
## 
## $status
## [1] 0
## 
## $num_episodes
## [1] 0
## 
## $start_season.year
## [1] 0
## 
## $start_season.season
## [1] 0
## 
## $rating
## [1] 0
## 
## $nsfw
## [1] 0
## 
## $demo_de
## [1] 153
## 
## $genres_de
## [1] 0
## 
## $studios_de
## [1] 0
## 
## $synonyms
## [1] 92
## 
## $alternative_title
## [1] 0

Ok, so the following fields contain nulls in the anime_table:

end_date
demo_de
synonyms

I plan on incorporating the demographic into my analysis, because of this, I will replace the null values in this field with missing.

anime_table$demo_de <- anime_table$demo_de %>%
  replace_na('missing')

# if I need to replace multiple columns, use the below code
# anime_table <- anime_table %>%
#   replace_na(list(x='missing', y = 'none'))

# When I run this for the first full time I will also encounter missing values in start_season_year which will require a substitution of the first four characters from the start_date

# anime_table <- anime_table %>%
#   mutate(start_season.year = case_when(
#     is.na(start_season.year) ~ as.integer(substr(start_date, 1, 4)),
#     !is.na(start_season.year) ~ start_season.year))
# lapply(anime_table,function(x) { length(which(is.na(x)))})

lapply(rank_table,function(x) { length(which(is.na(x)))})

## $tm_ky
## [1] 0
## 
## $mal_id
## [1] 0
## 
## $title
## [1] 0
## 
## $rank
## [1] 0
## 
## $rank_category
## [1] 0

lapply(demo_l,function(x) { length(which(is.na(x)))})

## $tm_ky
## [1] 0
## 
## $demo_id
## [1] 0
## 
## $demo_de
## [1] 0

lapply(genres_l,function(x) { length(which(is.na(x)))})

## $tm_ky
## [1] 0
## 
## $genres_id
## [1] 0
## 
## $genres_de
## [1] 0

lapply(studios_l,function(x) { length(which(is.na(x)))})

## $tm_ky
## [1] 0
## 
## $studio_id
## [1] 0
## 
## $studio_de
## [1] 0

All other tables look ok, so we’re good to move onto the next step.

Check for duplicates

I’ll check for duplicates for all fields first, then check individual fields where I wouldn’t expect duplicates (primary keys and potentially other fields).

anime_demo_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
demo_id	int

get_dupes(anime_genres_table)

## No variable names specified - using all columns.

## No duplicate combinations found of: tm_ky, mal_id, genres_id

## [1] tm_ky      mal_id     genres_id  dupe_count
## <0 rows> (or 0-length row.names)

get_dupes(anime_genres_table, mal_id, genres_id)

## No duplicate combinations found of: mal_id, genres_id

## [1] mal_id     genres_id  dupe_count tm_ky     
## <0 rows> (or 0-length row.names)

anime_genres_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
genres_id	int	PK

get_dupes(anime_genres_table)

## No variable names specified - using all columns.

## No duplicate combinations found of: tm_ky, mal_id, genres_id

## [1] tm_ky      mal_id     genres_id  dupe_count
## <0 rows> (or 0-length row.names)

get_dupes(anime_genres_table, mal_id, genres_id)

## No duplicate combinations found of: mal_id, genres_id

## [1] mal_id     genres_id  dupe_count tm_ky     
## <0 rows> (or 0-length row.names)

anime_ranking_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
mean	dbl
rank	int
popularity	int
num_scoring_users	int
statistics.watching	int
statistics.completed	int
statistics.on_hold	int
statistics.dropped	int
statistics.plan_to_watch	int
statistics.num_scoring_users	int

get_dupes(anime_ranking_table)

## No variable names specified - using all columns.

## No duplicate combinations found of: tm_ky, mal_id, mean, rank, popularity, num_scoring_users, statistics.watching, statistics.completed, statistics.on_hold, ... and 3 other variables

##  [1] tm_ky                        mal_id                      
##  [3] mean                         rank                        
##  [5] popularity                   num_scoring_users           
##  [7] statistics.watching          statistics.completed        
##  [9] statistics.on_hold           statistics.dropped          
## [11] statistics.plan_to_watch     statistics.num_scoring_users
## [13] dupe_count                  
## <0 rows> (or 0-length row.names)

get_dupes(anime_ranking_table, mal_id)

## No duplicate combinations found of: mal_id

##  [1] mal_id                       dupe_count                  
##  [3] tm_ky                        mean                        
##  [5] rank                         popularity                  
##  [7] num_scoring_users            statistics.watching         
##  [9] statistics.completed         statistics.on_hold          
## [11] statistics.dropped           statistics.plan_to_watch    
## [13] statistics.num_scoring_users
## <0 rows> (or 0-length row.names)

get_dupes(anime_ranking_table, rank)

## No duplicate combinations found of: rank

##  [1] rank                         dupe_count                  
##  [3] tm_ky                        mal_id                      
##  [5] mean                         popularity                  
##  [7] num_scoring_users            statistics.watching         
##  [9] statistics.completed         statistics.on_hold          
## [11] statistics.dropped           statistics.plan_to_watch    
## [13] statistics.num_scoring_users
## <0 rows> (or 0-length row.names)

get_dupes(anime_ranking_table, popularity)

## No duplicate combinations found of: popularity

##  [1] popularity                   dupe_count                  
##  [3] tm_ky                        mal_id                      
##  [5] mean                         rank                        
##  [7] num_scoring_users            statistics.watching         
##  [9] statistics.completed         statistics.on_hold          
## [11] statistics.dropped           statistics.plan_to_watch    
## [13] statistics.num_scoring_users
## <0 rows> (or 0-length row.names)

There can be duplicates for both rank and popularity (even though they should be unique. The data download occurs over the space of ~1 hour. Because of this, rankings may change slightly while the download is occuring resulting in duplicates or gaps.

Since I am not including the popularity or rank as items in my download, this is ok, however, if anyone is using these fields it should be a caveat.

anime_studios_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
studio_id	int	PK

get_dupes(anime_studios_table)

## No variable names specified - using all columns.

## No duplicate combinations found of: tm_ky, mal_id, studio_id

## [1] tm_ky      mal_id     studio_id  dupe_count
## <0 rows> (or 0-length row.names)

get_dupes(anime_studios_table, mal_id, studio_id)

## No duplicate combinations found of: mal_id, studio_id

## [1] mal_id     studio_id  dupe_count tm_ky     
## <0 rows> (or 0-length row.names)

anime_syn_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
synonyms	chr

get_dupes(anime_syn_table)

## No variable names specified - using all columns.

## No duplicate combinations found of: tm_ky, mal_id, synonyms

## [1] tm_ky      mal_id     synonyms   dupe_count
## <0 rows> (or 0-length row.names)

get_dupes(anime_syn_table, mal_id, synonyms)

## No duplicate combinations found of: mal_id, synonyms

## [1] mal_id     synonyms   dupe_count tm_ky     
## <0 rows> (or 0-length row.names)

Interestingly, in some cases, there can be duplicates of the synonyms. These are present on the website, however, I can exclude them here

anime_syn_table <- anime_syn_table %>%
  distinct(tm_ky, mal_id, synonyms)

anime_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
title	chr
main_picture.medium	chr
main_picture.large	chr
alternative_titles.en	chr
alternative_titles.ja	chr
start_date	chr
end_date	chr
synopsis	chr
media_type	chr
status	chr
num_episodes	int
start_season.year	int
start_season.season	chr
rating	chr
nsfw	chr
demo_de	chr
genres_de	chr
studios_de	chr
synonyms	chr
alternative_title	chr

get_dupes(anime_table)

## No variable names specified - using all columns.

## No duplicate combinations found of: tm_ky, mal_id, title, main_picture.medium, main_picture.large, alternative_titles.en, alternative_titles.ja, start_date, end_date, ... and 13 other variables

##  [1] tm_ky                 mal_id                title                
##  [4] main_picture.medium   main_picture.large    alternative_titles.en
##  [7] alternative_titles.ja start_date            end_date             
## [10] synopsis              media_type            status               
## [13] num_episodes          start_season.year     start_season.season  
## [16] rating                nsfw                  demo_de              
## [19] genres_de             studios_de            synonyms             
## [22] alternative_title     dupe_count           
## <0 rows> (or 0-length row.names)

get_dupes(anime_table, mal_id)

## No duplicate combinations found of: mal_id

##  [1] mal_id                dupe_count            tm_ky                
##  [4] title                 main_picture.medium   main_picture.large   
##  [7] alternative_titles.en alternative_titles.ja start_date           
## [10] end_date              synopsis              media_type           
## [13] status                num_episodes          start_season.year    
## [16] start_season.season   rating                nsfw                 
## [19] demo_de               genres_de             studios_de           
## [22] synonyms              alternative_title    
## <0 rows> (or 0-length row.names)

rank_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
title	chr
rank	int
rank_category	chr	PK

get_dupes(rank_table)

## No variable names specified - using all columns.

## No duplicate combinations found of: tm_ky, mal_id, title, rank, rank_category

## [1] tm_ky         mal_id        title         rank          rank_category
## [6] dupe_count   
## <0 rows> (or 0-length row.names)

get_dupes(rank_table, mal_id, rank_category)

##   mal_id rank_category dupe_count tm_ky        title rank
## 1  11079      favorite          2     2 Kill Me Baby 1500
## 2  11079      favorite          2     2 Kill Me Baby 1501

get_dupes(rank_table, mal_id, rank_category)

##   mal_id rank_category dupe_count tm_ky        title rank
## 1  11079      favorite          2     2 Kill Me Baby 1500
## 2  11079      favorite          2     2 Kill Me Baby 1501

get_dupes(rank_table, rank_category, rank)

## No duplicate combinations found of: rank_category, rank

## [1] rank_category rank          dupe_count    tm_ky         mal_id       
## [6] title        
## <0 rows> (or 0-length row.names)

Interestingly, there are no duplicates for rank. I assume that because this download is pretty fast, there isn’t time for dynamic changes in rank as time passes.

demo_l

Field	Type	Primary Key
tm_ky	int	PK
demo_id	int	PK
demo_de	chr

get_dupes(demo_l)

## No variable names specified - using all columns.

## No duplicate combinations found of: tm_ky, demo_id, demo_de

## [1] tm_ky      demo_id    demo_de    dupe_count
## <0 rows> (or 0-length row.names)

get_dupes(demo_l, demo_id)

## No duplicate combinations found of: demo_id

## [1] demo_id    dupe_count tm_ky      demo_de   
## <0 rows> (or 0-length row.names)

genres_l

Field	Type	Primary Key
tm_ky	int	PK
genres_id	int	PK
genres_de	chr

get_dupes(genres_l)

## No variable names specified - using all columns.

## No duplicate combinations found of: tm_ky, genres_id, genres_de

## [1] tm_ky      genres_id  genres_de  dupe_count
## <0 rows> (or 0-length row.names)

get_dupes(genres_l, genres_id)

## No duplicate combinations found of: genres_id

## [1] genres_id  dupe_count tm_ky      genres_de 
## <0 rows> (or 0-length row.names)

studios_l

Field	Type	Primary Key
tm_ky	int	PK
studio_id	int	PK
studio_de	chr

get_dupes(studios_l)

## No variable names specified - using all columns.

## No duplicate combinations found of: tm_ky, studio_id, studio_de

## [1] tm_ky      studio_id  studio_de  dupe_count
## <0 rows> (or 0-length row.names)

get_dupes(studios_l, studio_id)

## No duplicate combinations found of: studio_id

## [1] studio_id  dupe_count tm_ky      studio_de 
## <0 rows> (or 0-length row.names)

Check for inconsistencies in names

There are only a few fields that I can review for consistency. While I might want to ensure that every title doesn’t have a slightly different name, it just isn’t feasible at this point. As a result, I will take a focused approach to each table

anime_demo_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
demo_id	int

tabyl(anime_demo_table, demo_id)

##  demo_id   n     percent
##       15   2 0.008333333
##       25  29 0.120833333
##       27 138 0.575000000
##       42  58 0.241666667
##       43  13 0.054166667

anime_genres_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
genres_id	int	PK

tabyl(anime_genres_table, genres_id)

anime_ranking_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
mean	dbl
rank	int
popularity	int
num_scoring_users	int
statistics.watching	int
statistics.completed	int
statistics.on_hold	int
statistics.dropped	int
statistics.plan_to_watch	int
statistics.num_scoring_users	int

anime_studios_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
studio_id	int	PK

tabyl(anime_studios_table, studio_id)

anime_syn_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
synonyms	chr

anime_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
title	chr
main_picture.medium	chr
main_picture.large	chr
alternative_titles.en	chr
alternative_titles.ja	chr
start_date	chr
end_date	chr
synopsis	chr
media_type	chr
status	chr
num_episodes	int
start_season.year	int
start_season.season	chr
rating	chr
nsfw	chr
demo_de	chr
genres_de	chr
studios_de	chr
synonyms	chr
alternative_title	chr

tabyl(anime_table, media_type)

##  media_type   n percent
##          tv 393       1

tabyl(anime_table, status)

##            status   n    percent
##  currently_airing   9 0.02290076
##   finished_airing 384 0.97709924

tabyl(anime_table, start_season.year)

##  start_season.year  n     percent
##               1970  1 0.002544529
##               1978  2 0.005089059
##               1979  1 0.002544529
##               1980  1 0.002544529
##               1982  1 0.002544529
##               1984  1 0.002544529
##               1985  1 0.002544529
##               1986  2 0.005089059
##               1987  1 0.002544529
##               1988  1 0.002544529
##               1989  1 0.002544529
##               1990  1 0.002544529
##               1992  1 0.002544529
##               1993  1 0.002544529
##               1995  2 0.005089059
##               1996  6 0.015267176
##               1997  2 0.005089059
##               1998  5 0.012722646
##               1999  4 0.010178117
##               2000  1 0.002544529
##               2001  1 0.002544529
##               2002  8 0.020356234
##               2003  5 0.012722646
##               2004  7 0.017811705
##               2005  8 0.020356234
##               2006 18 0.045801527
##               2007 17 0.043256997
##               2008 13 0.033078880
##               2009 14 0.035623410
##               2010 13 0.033078880
##               2011 19 0.048346056
##               2012 22 0.055979644
##               2013 18 0.045801527
##               2014 21 0.053435115
##               2015 27 0.068702290
##               2016 20 0.050890585
##               2017 20 0.050890585
##               2018 22 0.055979644
##               2019 23 0.058524173
##               2020 21 0.053435115
##               2021 34 0.086513995
##               2022  6 0.015267176

Surprisingly, the number of anime decreased after 2018 and 2019, but increased a bit in 2021.

tabyl(anime_table, start_season.season)

##  start_season.season   n   percent
##                 fall 131 0.3333333
##               spring 123 0.3129771
##               summer  57 0.1450382
##               winter  82 0.2086514

tabyl(anime_table, rating)

##  rating   n     percent
##       g   7 0.017811705
##      pg   1 0.002544529
##   pg_13 269 0.684478372
##       r 116 0.295165394

tabyl(anime_table, nsfw)

##   nsfw   n     percent
##   gray   1 0.002544529
##  white 392 0.997455471

rank_table

Field	Type	Primary Key
tm_ky	int	PK
mal_id	int	PK
title	chr
rank	int
rank_category	chr	PK

tabyl(rank_table, rank_category)

##  rank_category     n     percent
##         airing   298 0.003804126
##            all 19380 0.247395833
##   bypopularity 19380 0.247395833
##       favorite 23515 0.300181270
##          movie  3501 0.044692096
##            ova  3944 0.050347222
##        special  2313 0.029526654
##             tv  5583 0.071269914
##       upcoming   422 0.005387051

demo_l

Field	Type	Primary Key
tm_ky	int	PK
demo_id	int	PK
demo_de	chr

tabyl(demo_l, demo_de)

##  demo_de n percent
##    Josei 1     0.2
##     Kids 1     0.2
##   Seinen 1     0.2
##   Shoujo 1     0.2
##  Shounen 1     0.2

genres_l

Field	Type	Primary Key
tm_ky	int	PK
genres_id	int	PK
genres_de	chr

tabyl(genres_l, genres_de)

##      genres_de n    percent
##         Action 1 0.02857143
##      Adventure 1 0.02857143
##    Avant Garde 1 0.02857143
##      Boys Love 1 0.02857143
##           Cars 1 0.02857143
##         Comedy 1 0.02857143
##         Demons 1 0.02857143
##          Drama 1 0.02857143
##          Ecchi 1 0.02857143
##        Fantasy 1 0.02857143
##           Game 1 0.02857143
##     Girls Love 1 0.02857143
##        Gourmet 1 0.02857143
##          Harem 1 0.02857143
##     Historical 1 0.02857143
##         Horror 1 0.02857143
##   Martial Arts 1 0.02857143
##          Mecha 1 0.02857143
##       Military 1 0.02857143
##          Music 1 0.02857143
##        Mystery 1 0.02857143
##         Parody 1 0.02857143
##         Police 1 0.02857143
##  Psychological 1 0.02857143
##        Romance 1 0.02857143
##        Samurai 1 0.02857143
##         School 1 0.02857143
##         Sci-Fi 1 0.02857143
##  Slice of Life 1 0.02857143
##          Space 1 0.02857143
##         Sports 1 0.02857143
##    Super Power 1 0.02857143
##   Supernatural 1 0.02857143
##       Suspense 1 0.02857143
##        Vampire 1 0.02857143

studios_l

Field	Type	Primary Key
tm_ky	int	PK
studio_id	int	PK
studio_de	chr

tabyl(studios_l, studio_de)

##                 studio_de n    percent
##                      8bit 1 0.01219512
##              A-1 Pictures 1 0.01219512
##                  A.C.G.T. 1 0.01219512
##                   Ajia-Do 1 0.01219512
##                   Artland 1 0.01219512
##     Bandai Namco Pictures 1 0.01219512
##                 Bee Train 1 0.01219512
##  Bibury Animation Studios 1 0.01219512
##                     Bones 1 0.01219512
##              Brain's Base 1 0.01219512
##                 C-Station 1 0.01219512
##               CloverWorks 1 0.01219512
##          David Production 1 0.01219512
##                 Doga Kobo 1 0.01219512
##                 E&G Films 1 0.01219512
##                  Egg Firm 1 0.01219512
##           Encourage Films 1 0.01219512
##                     feel. 1 0.01219512
##                    Gainax 1 0.01219512
##                    Gallop 1 0.01219512
##               Geno Studio 1 0.01219512
##                     Gonzo 1 0.01219512
##                 Group TAC 1 0.01219512
##            Hal Film Maker 1 0.01219512
##     Hoods Drifters Studio 1 0.01219512
##                    Imagin 1 0.01219512
##                 J.C.Staff 1 0.01219512
##             Kinema Citrus 1 0.01219512
##           Kyoto Animation 1 0.01219512
##                    Lerche 1 0.01219512
##                LIDENFILMS 1 0.01219512
##                  Madhouse 1 0.01219512
##                  Manglobe 1 0.01219512
##                     MAPPA 1 0.01219512
##                Marvy Jack 1 0.01219512
##          Mushi Production 1 0.01219512
##          Nippon Animation 1 0.01219512
##                     Nomad 1 0.01219512
##                       Nut 1 0.01219512
##                       OLM 1 0.01219512
##                    Orange 1 0.01219512
##                P.A. Works 1 0.01219512
##                  P.I.C.S. 1 0.01219512
##                    Pastel 1 0.01219512
##              Pierrot Plus 1 0.01219512
##           Platinum Vision 1 0.01219512
##            Production I.G 1 0.01219512
##                     Radix 1 0.01219512
##                  SANZIGEN 1 0.01219512
##                 Satelight 1 0.01219512
##              Science SARU 1 0.01219512
##                Seven Arcs 1 0.01219512
##                     Shaft 1 0.01219512
##         Shin-Ei Animation 1 0.01219512
##                     Shuka 1 0.01219512
##              SILVER LINK. 1 0.01219512
##               Studio Bind 1 0.01219512
##              Studio Comet 1 0.01219512
##               Studio Deen 1 0.01219512
##             Studio Hibari 1 0.01219512
##              Studio Junio 1 0.01219512
##                Studio Kai 1 0.01219512
##            Studio Pierrot 1 0.01219512
##           Studio Signpost 1 0.01219512
##                   Sunrise 1 0.01219512
##                 SynergySP 1 0.01219512
##      Tatsunoko Production 1 0.01219512
##    Telecom Animation Film 1 0.01219512
##                Telescreen 1 0.01219512
##        Tezuka Productions 1 0.01219512
##         TMS Entertainment 1 0.01219512
##            Toei Animation 1 0.01219512
##       Tokyo Movie Shinsha 1 0.01219512
##                Trans Arts 1 0.01219512
##            Triangle Staff 1 0.01219512
##                   Trigger 1 0.01219512
##                    TROYCA 1 0.01219512
##            TYO Animations 1 0.01219512
##                  ufotable 1 0.01219512
##                 White Fox 1 0.01219512
##                Wit Studio 1 0.01219512
##                    Zero-G 1 0.01219512

Exporting the final files

anime_demo_table
anime_genres_table
anime_ranking_table
anime_studios_table
anime_syn_table
anime_table
rank_table
demo_l
genres_l
studios_l
tm_ky

exportFile <- function(file_name, df){
  if (file_test("-f", file_name)) {
    write.table(df, file = file_name, sep = ",", append = TRUE, quote = TRUE, row.names = FALSE, col.names = FALSE)
  } else {
    write.csv(df, file_name, row.names = FALSE)  
  }
}

exportFile(paste0("anime_demo_table",".csv"),anime_demo_table)
exportFile(paste0("anime_genres_table",".csv"),anime_genres_table)
exportFile(paste0("anime_ranking_table",".csv"),anime_ranking_table)
exportFile(paste0("anime_studios_table",".csv"),anime_studios_table)
exportFile(paste0("anime_syn_table",".csv"),anime_syn_table)
exportFile(paste0("anime_table",".csv"),anime_table)
exportFile(paste0("rank_table",".csv"),rank_table)
exportFile(paste0("demo_l",".csv"),demo_l)
exportFile(paste0("genres_l",".csv"),genres_l)
exportFile(paste0("studios_l",".csv"),studios_l)
exportFile(paste0("tm_ky",".csv"),tm_ky)

Conclusion

At this point, all of the files have been written/appended to the tables. Now they must be uploaded to the Tableau Public server and Kaggle.com here

patmendoza330 / animelistclean