setwd("F:/PROJET JUMEAUX INED-MNHM/DATABASE/RAW DATA & METADATA/USA")

# ******************************************************************************
#                                UNITED STATES
#
# This file provides the calculations performed on the input data for the USA,
# as used in the construction of the Human Multiple Births Database
#
# ******************************************************************************


library(dplyr)
library(tidyr)
library(openxlsx)
library(ggplot2)
library(forecast)

  

data <- read.xlsx("USA_InputData_25.01.2023.xlsx", sheet = "input data")
head(data)

data %>% arrange(Year, Stillbirths) -> data



# Drop the series that includes the fetal deaths (Stillbirths = 1), because of
# of data quality issues (see metadata file for the USA)........................
data %>% filter(Stillbirths != 1) -> data



# Data from the Vital Statistics - unit: deliveries (1915-1958).................
data %>%
  filter(Year <= 1958 & Stillbirths == 2) %>%
  mutate(Multiple_children = ifelse(is.na(Multiple_children),
                                          Total_children - Singletons,
                                          Multiple_children),
         # For some years, the number of multiple children is available in the 
         # vital statistics reports. Those numbers are equal to the difference between
         # the total number of babies born alive and the number of singletons.
         Twinning_rate = (Twin_deliveries / Total_deliveries) * 1000,
         Multiple_rate = (Multiple_deliveries / Total_deliveries) * 1000) -> data_VSUS_deliveries



# Data from the Vital Statistics - unit: children (1959-2020)...................
# Series including only the live births (Stillbirths = 0). Remove 1969 and 1970, 
# as there is no data on births by plurality for those years  
remove.years <- c(1969, 1970)
data %>% filter(Stillbirths == 0 & !(Year %in% remove.years)) ->  data_VSUS_children



# From 1961 to 1988, information on triplets + is aggregated (i.e. no distinction 
# between triplets, quadruplets, etc.) in the sources. See if there is a trend in 
# the proportion of quadruplets + in the years with available data:

head(data_VSUS_children)
data_VSUS_children %>%
  mutate(p_4 = Quadruplet_plus_children / (Triplet_children + Quadruplet_plus_children)) %>%
  select(Year, p_4) -> check1

head(data_VSUS_deliveries)
data_VSUS_deliveries %>%
  mutate(p_4 = Quadruplet_plus_deliveries / (Triplet_deliveries + Quadruplet_plus_deliveries)) %>%
  select(Year, p_4) -> check2

check <- rbind(check1, check2)
check %>% arrange(Year) -> check

ggplot(data = check) +
  geom_point(aes(x = Year, y = p_4)) 

# Based on the graph, it is evident that the proportion of quadruplet births 
# increased during the period 1961-1988. Assume linear increase to calculate the 
# number of quadruplet births:

library(imputeTS)
check %>% mutate(p_4_lin = na_interpolation(p_4, option = "linear")) -> check

ggplot(data = check) +
  geom_point(aes(x = Year, y = p_4)) +
  geom_point(aes(x = Year, y = p_4_lin), shape = 1, colour = "red") 


head(check)
some.years <- c(1961:1988, 2021) # These are the years for which the number of 
                                 # quadruplet births needs to be disaggregated from 
                                 # the number of triplets + 
check %>%
  filter(Year %in% some.years) %>%
  select(Year, p_4_lin) -> p_quadruplets

rm(check, check1, check2)


# Add weights to data to estimate the number of quadruplets from 1961 to 1988:
data_VSUS_children <- left_join(data_VSUS_children, p_quadruplets)
rm(p_quadruplets)
head(data_VSUS_children)

data_VSUS_children %>%
  mutate(Twin_deliveries = Twin_children / 2,
         Triplet_children = ifelse(Year == 2021,
                                   Total_children - Singletons - Twin_children,
                                   # Data for 2021 available at the time of this update
                                   # are provisional. Here triplet children include also the
                                   # quadruplets and more.
                                   Triplet_children),
         Quadruplet_plus_children = ifelse(Year %in% some.years,
                                           p_4_lin * Triplet_children,
                                           Quadruplet_plus_children),
         Triplet_children2 = ifelse(Year %in% some.years,
                                    (1 - p_4_lin) * Triplet_children,
                                    Triplet_children),
         # Continue working with Triplet_children2 instead of Triplet_children,
         # as the former is an estimation of triplets only. 
         Triplet_deliveries = Triplet_children2 / 3,
         Quadruplet_plus_deliveries = Quadruplet_plus_children / 4,
         
         Multiple_children = Total_children - Singletons,
         # Separate calculations show that this gives the same results as:  
         # multiples = twins + triplets + quadruplets
         
         Multiple_deliveries = Twin_deliveries + Triplet_deliveries + Quadruplet_plus_deliveries,
         Total_deliveries = Singletons + Multiple_deliveries,
         Twinning_rate = (Twin_deliveries / Total_deliveries) * 1000,
         Multiple_rate = (Multiple_deliveries / Total_deliveries) * 1000,
         
         Stillbirths = 2,
         # Changed from Stillbirths = 0 (i.e., not included in the statistics) to 
         # Stillbirths = 2 (i.e., mixed treatment of stillbirths). See explanations
         # in the metadata file for the US, section "Definitions and treatment of 
         # births by vitality".  
         
         Footnotes = ifelse(Year %in% some.years,
                            paste0(Footnotes, "; 10"),
                            Footnotes)) %>%
  select(Country, Source, Year, Stillbirths, Singletons, Twin_deliveries, Twin_children,
         Triplet_deliveries, Triplet_children2, Quadruplet_plus_deliveries, 
         Quadruplet_plus_children, Multiple_deliveries, Multiple_children,
         Total_deliveries, Total_children, Twinning_rate, Multiple_rate, Footnotes) -> data_VSUS_children

names(data_VSUS_children)[9] <- "Triplet_children" 



# compile results ............................................................. 
rm(data)
data <- rbind(data_VSUS_deliveries, data_VSUS_children)
rm(data_VSUS_deliveries, data_VSUS_children)
data %>% arrange(Year) -> data



# Identify outliers.............................................................
ggplot(data = data) +
  geom_point(aes(x = Year, y = Twinning_rate, shape = as.factor(Stillbirths))) +
  geom_point(aes(x = Year, y = Multiple_rate, shape = as.factor(Stillbirths)), colour = "red")


outliers_tr <- tsoutliers(data$Twinning_rate)
outliers_mr <- tsoutliers(data$Multiple_rate)


data %>% 
  select(Source, Year, Twinning_rate, Multiple_rate) %>%
  mutate(outlier = ifelse(row_number() %in% outliers_tr$index |
                            row_number() %in% outliers_mr$index,
                          1, 0)) -> check

subset(check, outlier == 1) # No outliers found
rm(check, outliers_tr, outliers_mr)



# Check discrepancies ...........................................................
data %>%
  mutate(check1 = round(Total_deliveries - Singletons - Multiple_deliveries, 2),
         check2 = round(Total_children - Singletons - Multiple_children, 2)) %>%
  rowwise() %>%
  mutate(check3 = round(Multiple_deliveries - sum(Twin_deliveries,
                                                  Triplet_deliveries,
                                                  Quadruplet_plus_deliveries,
                                                  na.rm = T)),
         check4 = ifelse(Year >= 1959,
                         round(Multiple_children - 
                                 sum(Twin_children, Triplet_children,
                                     Quadruplet_plus_children, na.rm = T)),
                         NA)) %>%
  ungroup() %>%
  as.data.frame() %>%
  filter(check1 != 0 | check2 != 0 | check3 != 0 | check4 != 0) -> check

# For the years 1951, 1954, 1957 and 1958, the total number of deliveries reported 
# in the original sources is larger than the sum of singleton and multiple deliveries. 
# The original data was retained in the HMBD, as it is not possible to distinguish 
# the source of those discrepancies (i.e., whether they originate from the number 
# of singleton, multiple or total deliveries). Nevertheless, the Twinning and 
# Multiple rates are barely affected by this issue, regardless of the number used 
# in the denominator (i.e., the reported number of total deliveries or the sum of 
# singleton and multiple deliveries). 
rm(check)



# Save data.....................................................................
write.table(data, 
            "F:/PROJET JUMEAUX INED-MNHM/DATABASE/ESTIMATES/USA_ALLDATA.txt",
            row.names = F)


