# Set working directory 
setwd("D:/NextCloud/HMBD_partage/Data to put online/CAN")

# ******************************************************************************
#                                CANADA
# 
#  This file provides the calculations performed on the input data for 
#  Canada, as used in the construction of the Human Multiple Births Database
# 
# ******************************************************************************

library(dplyr)
library(tidyr)
library(openxlsx)
library(ggplot2)
library(forecast)


data <- read.xlsx("CAN_InputData_03.12.2024.xlsx", sheet = "input data")
head(data)



# Data from Bunle (1921-1925)...................................................
data %>%
  filter(Source == "Bunle" & Year < 1926) %>%   # Data from 1926 are in the Year Books
  mutate(Singletons = Total_deliveries - Multiple_deliveries,
         Twin_deliveries = Multiple_deliveries - Triplet_deliveries - Quadruplet_plus_deliveries,
         Twinning_rate = (Twin_deliveries / Total_deliveries) * 1000,
         Multiple_rate = (Multiple_deliveries / Total_deliveries) * 1000) -> data_Bunle
                             
     


# Data from the Year Books of the Dominion Bureau of Statistics (1926-1965).....
data %>%
  filter(Source == "BureauStats") %>%   
  mutate(Multiple_deliveries = Twin_deliveries + Triplet_deliveries + Quadruplet_plus_deliveries,
         Multiple_children = Twin_children + Triplet_children + Quadruplet_plus_children,
         Total_deliveries = ifelse(Year <= 1940, Singletons + Multiple_deliveries, Total_deliveries),
         Twinning_rate = (Twin_deliveries / Total_deliveries) * 1000,
         Multiple_rate = (Multiple_deliveries / Total_deliveries) * 1000) -> data_BureauStats



# Data from the articles by Imaizumi (1972-1990)................................
data %>% filter(Source == "Imaizumi") -> data_Imaizumi
  



# Data from Statistics Canada: online data (1991-2023)..........................

unknown <- read.xlsx("CAN_InputData_03.12.2024.xlsx", sheet = "supplement",  # rows should be corrected with new updates
                     rows = 3:36, cols = 1:2)
names(unknown)[2] <- "Unknown"


data %>% filter(Source == "StatCan") -> data_StatCan

data_StatCan <- left_join(data_StatCan, unknown)
head(data_StatCan)


data_StatCan %>%
  mutate(Twin_deliveries = Twin_children / 2,
         Triplet_deliveries = Triplet_children / 3,
         Quadruplet_plus_deliveries = Quadruplet_plus_children / 4,
         Multiple_deliveries = Twin_deliveries + Triplet_deliveries + Quadruplet_plus_deliveries,
         Multiple_children = Twin_children + Triplet_children + Quadruplet_plus_children,
         Total_deliveries = Singletons + Multiple_deliveries + Unknown,
         Twinning_rate = (Twin_deliveries / Total_deliveries) * 1000,
         Multiple_rate = (Multiple_deliveries / Total_deliveries) * 1000) -> data_StatCan

data_StatCan %>% select(-Unknown) -> data_StatCan


# Compile data and estimates from all sources................................... 
data <- rbind(data_Bunle, data_BureauStats, data_Imaizumi, data_StatCan)
data %>% arrange(Year) -> data


# Identify outliers.............................................................
outliers_tr <- tsoutliers(data$Twinning_rate)
outliers_mr <- tsoutliers(data$Multiple_rate)

data %>% 
  select(Source, Year, Twinning_rate, Multiple_rate) %>%
  mutate(outlier = ifelse(row_number() %in% outliers_tr$index |
                            row_number() %in% outliers_mr$index,
                          1, 0)) -> check

subset(check, outlier == 1)

# Save data.....................................................................
write.table(data, 
            "D:/NextCloud/HMBD_partage/Pooled database/CAN_ALLDATA.txt",
            row.names = F)


