# Make the Spanish raw-data folder the working directory so that the relative
# input file name passed to read.xlsx() further down resolves against it.
# NOTE(review): absolute, machine-specific path - adjust before running on
# another machine.
setwd("D:/PROJET JUMEAUX INED-MNHM/DATABASE/RAW DATA & METADATA/ESP")

# ******************************************************************************
#                                  SPAIN
#
# This file provides the calculations performed on the input data for 
# Spain, as used in the construction of the Human Multiple Births Database
#
# ******************************************************************************


library(dplyr)
library(tidyr)
library(openxlsx)
library(ggplot2)
library(forecast)



# Load the "input data" sheet of the Spanish workbook; the relative file name
# is resolved against the current working directory. The first rows are then
# displayed as a quick look at the imported columns.
data <- openxlsx::read.xlsx(
  xlsxFile = "ESP_InputData_18.07.2023.xlsx",
  sheet = "input data"
)
utils::head(data)


# Derive the multiple-birth aggregates and rates for the years 1901 onwards.
#   Multiple_deliveries: twin + triplet + quadruplet-plus deliveries, with
#     missing components counted as 0.
#   Multiple_children: 2 per twin delivery + 3 per triplet delivery + the
#     quadruplet-plus children; when Quadruplet_plus_children is missing, each
#     quadruplet-plus delivery is counted as 4 children.
#   Total_deliveries: kept where present, otherwise
#     Singletons + Multiple_deliveries.
#   Twinning_rate / Multiple_rate: twin / multiple deliveries per 1000
#     deliveries.
# Row totals are computed with the vectorised rowSums() instead of
# rowwise() + sum(); mutate() evaluates its arguments in order, so the later
# columns can use the ones created just above them.
data <- data %>%
  filter(Year >= 1901) %>%
  mutate(
    Multiple_deliveries = rowSums(
      cbind(Twin_deliveries, Triplet_deliveries, Quadruplet_plus_deliveries),
      na.rm = TRUE
    ),
    Multiple_children = rowSums(
      cbind(
        Twin_deliveries * 2,
        Triplet_deliveries * 3,
        # Fall back to 4 children per delivery when the count is not reported.
        ifelse(is.na(Quadruplet_plus_children),
               Quadruplet_plus_deliveries * 4,
               Quadruplet_plus_children)
      ),
      na.rm = TRUE
    ),
    Total_deliveries = ifelse(is.na(Total_deliveries),
                              Singletons + Multiple_deliveries,
                              Total_deliveries),
    Twinning_rate = (Twin_deliveries / Total_deliveries) * 1000,
    Multiple_rate = (Multiple_deliveries / Total_deliveries) * 1000
  ) %>%
  as.data.frame()





# Check discrepancies ...........................................................
head(data)

# Internal-consistency checks on the totals:
#   check1: Total_deliveries - Singletons - Multiple_deliveries
#   check2: Total_children   - Singletons - Multiple_children
#   check3: Multiple_deliveries - (twin + triplet + quadruplet-plus deliveries,
#           missing components counted as 0)
# Only rows with at least one non-zero check are kept. A row whose condition
# evaluates to NA (a missing check and no non-zero one) is dropped by filter()
# and is therefore not reported.
check <- data %>%
  mutate(
    check1 = round(Total_deliveries - Singletons - Multiple_deliveries, 2),
    check2 = round(Total_children - Singletons - Multiple_children, 2),
    check3 = round(
      Multiple_deliveries -
        rowSums(cbind(Twin_deliveries, Triplet_deliveries,
                      Quadruplet_plus_deliveries),
                na.rm = TRUE)
    )
  ) %>%
  filter(check1 != 0 | check2 != 0 | check3 != 0) %>%
  as.data.frame()

# Surface any inconsistency before the temporary table is removed, so that a
# non-interactive run does not discard the result unseen.
if (nrow(check) > 0) {
  print(check)
  warning(nrow(check), " row(s) with inconsistent totals in the Spanish data",
          call. = FALSE)
}

rm(check)




# Identify outliers.............................................................
# Visual screening: both rates are drawn against Year on the same panel
# (twinning rate in the default colour, multiple rate in red), with the point
# shape given by the Stillbirths column.
ggplot(data, aes(x = Year, shape = as.factor(Stillbirths))) +
  geom_point(aes(y = Twinning_rate)) +
  geom_point(aes(y = Multiple_rate), colour = "red")


# Run forecast::tsoutliers() on each rate series; the `index` element of each
# result holds the row positions flagged as outliers.
tr_outliers <- tsoutliers(data$Twinning_rate)
mr_outliers <- tsoutliers(data$Multiple_rate)


# outlier = 1 when the row is flagged for either rate, 0 otherwise.
check <- data %>%
  select(Source, Year, Twinning_rate, Multiple_rate) %>%
  mutate(outlier = as.numeric(
    row_number() %in% c(tr_outliers$index, mr_outliers$index)
  ))

# Rates for the years 1982, 2014 and 2016 are identified as outliers.
check[check$outlier == 1, ]

# Drop the temporary objects.
rm(check, tr_outliers, mr_outliers)



# Save data.....................................................................
# Export the `data` table as text, using the write.table() defaults apart from
# dropping the row names. FALSE is spelled out because `F` is an ordinary
# variable that can be reassigned.
# NOTE(review): absolute, machine-specific output path.
write.table(
  x = data,
  file = "D:/PROJET JUMEAUX INED-MNHM/DATABASE/ESTIMATES/ESP_ALLDATA.txt",
  row.names = FALSE
)




  


