Show the code
# Loading necessary libraries ####
library(tidyverse)
library(here)
library(janitor)
library(readxl)
library(writexl)
library(shiny)
library(DT)
library(flextable)
# Loading datasets ####
file_kanji <- "Kanji_20240227_081842.csv"
file_jukugo <- "Jukugo_20240227_081908.csv"
file_words <- "Optimized Kore - Sheet1.csv"
file_sentences1 <- "Sentences.xlsx"
file_sentences2 <- "Sentences_core.xlsx"
## Kanji list ####
data_kanji <- read.csv2(here("data/kanji", file_kanji)) %>% 
  clean_names()
## Jukugo list ####
data_jukugo <- read.csv2(here("data/kanji", file_jukugo)) %>% 
  clean_names() %>% 
  arrange(desc(frequency)) %>% 
  select(comp_word, pronunciation, english_translation) %>%
  rename(word = comp_word,
         meaning = english_translation) %>% 
  distinct(word, .keep_all = TRUE) %>% 
  # Change romaji
  mutate(
    pronunciation = gsub("zi", "ji", pronunciation),
    pronunciation = gsub("zy", "jy", pronunciation),
    pronunciation = gsub("ti", "chi", pronunciation),
    pronunciation = gsub("ty", "ch", pronunciation),
    pronunciation = gsub("si", "shi", pronunciation),
    pronunciation = gsub("sy", "shy", pronunciation),
    pronunciation = gsub("tu", "tsu", pronunciation),
    pronunciation = gsub("hu", "fu", pronunciation)
    )
## word list ####
data_words <- read.csv(here("data/kanji", file_words)) %>% 
  clean_names() %>% 
  arrange(core_index) %>% 
  select(vocab_expression, vocab_kana, vocab_meaning) %>% 
  rename(word = vocab_expression,
         pronunciation = vocab_kana,
         meaning = vocab_meaning) %>% 
  distinct(word, .keep_all = TRUE) 
## Number list ####
data_numbers <- data.frame(
  word = c("零", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十", 
     "十一", "十二", "十三", "十四", "十五", "十六", "十七", "十八", "十九", 
     "二十", "二十一", "二十二", "二十三", "二十四", "二十五", "二十六", "二十七", 
     "二十八", "二十九", "三十", "三十一", "三十二", "三十三", "三十四", "三十五", 
     "三十六", "三十七", "三十八", "三十九", "四十", "四十一", "四十二", "四十三", 
     "四十四", "四十五", "四十六", "四十七", "四十八", "四十九", "五十", "五十一", 
     "五十二", "五十三", "五十四", "五十五", "五十六", "五十七", "五十八", "五十九", 
     "六十", "六十一", "六十二", "六十三", "六十四", "六十五", "六十六", "六十七", 
     "六十八", "六十九", "七十", "七十一", "七十二", "七十三", "七十四", "七十五", 
     "七十六", "七十七", "七十八", "七十九", "八十", "八十一", "八十二", "八十三", 
     "八十四", "八十五", "八十六", "八十七", "八十八", "八十九", "九十", "九十一", 
     "九十二", "九十三", "九十四", "九十五", "九十六", "九十七", "九十八", "九十九", "百"),
  pronunciation = c("れい", "いち", "に", "さん", "し", "ご", "ろく", "しち", "はち", "きゅう", "じゅう",
                    "じゅういち", "じゅうに", "じゅうさん", "じゅうし", "じゅうご", "じゅうろく", "じゅうしち", "じゅうはち", "じゅうきゅう",
                    "にじゅう", "にじゅういち", "にじゅうに", "にじゅうさん", "にじゅうし", "にじゅうご", "にじゅうろく", "にじゅうしち", "にじゅうはち", "にじゅうきゅう",
                    "さんじゅう", "さんじゅういち", "さんじゅうに", "さんじゅうさん", "さんじゅうし", "さんじゅうご", "さんじゅうろく", "さんじゅうしち", "さんじゅうはち", "さんじゅうきゅう",
                    "よんじゅう", "よんじゅういち", "よんじゅうに", "よんじゅうさん", "よんじゅうし", "よんじゅうご", "よんじゅうろく", "よんじゅうしち", "よんじゅうはち", "よんじゅうきゅう",
                    "ごじゅう", "ごじゅういち", "ごじゅうに", "ごじゅうさん", "ごじゅうし", "ごじゅうご", "ごじゅうろく", "ごじゅうしち", "ごじゅうはち", "ごじゅうきゅう",
                    "ろくじゅう", "ろくじゅういち", "ろくじゅうに", "ろくじゅうさん", "ろくじゅうし", "ろくじゅうご", "ろくじゅうろく", "ろくじゅうしち", "ろくじゅうはち", "ろくじゅうきゅう",
                    "しちじゅう", "しちじゅういち", "しちじゅうに", "しちじゅうさん", "しちじゅうし", "しちじゅうご", "しちじゅうろく", "しちじゅうしち", "しちじゅうはち", "しちじゅうきゅう",
                    "はちじゅう", "はちじゅういち", "はちじゅうに", "はちじゅうさん", "はちじゅうし", "はちじゅうご", "はちじゅうろく", "はちじゅうしち", "はちじゅうはち", "はちじゅうきゅう",
                    "きゅうじゅう", "きゅうじゅういち", "きゅうじゅうに", "きゅうじゅうさん", "きゅうじゅうし", "きゅうじゅうご", "きゅうじゅうろく", "きゅうじゅうしち", "きゅうじゅうはち", "きゅうじゅうきゅう",
                    "ひゃく"),
  meaning = as.character(0:100)
  )
# Sentence list ####
data_sentences1 <- read_excel(here("data/kanji", file_sentences1)) 
data_sentences2 <- read_excel(here("data/kanji", file_sentences2))
data_sentences <- rbind(data_sentences1, data_sentences2)
rm(data_sentences1, data_sentences2)
# Extract kana ####
hiragana_chars <- intToUtf8(seq(12353, 12438)) # Unicode range for hiragana characters
katakana_chars <- intToUtf8(seq(12448, 12543)) # Unicode range for katakana characters
kana <- paste0(hiragana_chars, katakana_chars) %>% 
  str_split_1(pattern = "") %>% 
  paste(collapse = "|")
# All data ####
data_bind <- data_words %>% 
  rbind(data_sentences %>% select(word, pronunciation, meaning),
        data_numbers,
        data_jukugo) %>% 
  mutate(word = str_remove_all(word, "[ a-zA-Z]")) %>% 
  distinct(word, .keep_all = TRUE)
# Extract kanji ####
all_kanji <- data_kanji %>% 
  select(kanji) %>% 
  rbind(data_bind %>% 
          mutate(kanji = word, .keep = "none")) %>% 
  mutate(kanji = str_remove_all(kanji, kana)) %>% 
  filter(kanji != "") %>% 
  pull() %>% 
  paste(collapse = "") %>% 
  str_split_1(pattern = "") %>% 
  unique()
  
## Separate individual kanji from all words ####
data_all <- data_bind %>% 
  mutate(kanji = str_remove_all(word, kana)) %>% 
  separate_wider_position(kanji, widths = c("kanji_1" = 1, 
                                            "kanji_2" = 2,
                                            "kanji_3" = 3,
                                            "kanji_4" = 4,
                                            "kanji_5" = 5),
                          too_few = "align_start") %>% 
  filter(!is.na(kanji_1))
flextable(head(data_all))| word | pronunciation | meaning | kanji_1 | kanji_2 | kanji_3 | kanji_4 | kanji_5 | 
|---|---|---|---|---|---|---|---|
| 見る | みる | see, look at | 見 | ||||
| 円 | えん | circle | 円 | ||||
| 多い | おおい | lots of | 多 | ||||
| 家 | うち | house, home | 家 | ||||
| 新しい | あたらしい | new | 新 | ||||
| 私 | わたし | I | 私 |