Browse Source

Updates preprocess pipeline with several changes

1. Fixes the number of days since first case column for countries that
   had their first case before Feb 11
2. Adds number of days since first death column (from Feb 11)
3. Adds info about DVC remote (once again donated by DAGsHub)
4. Fixes duplicated rows for US from different UN datasets
Marcel Ribeiro-Dantas 6 months ago
parent
commit
3613d3cd6f
3 changed files with 59 additions and 12 deletions
  1. 4
    0
      .dvc/config
  2. 3
    3
      preprocess.dvc
  3. 52
    9
      scripts/preprocess.R

+ 4
- 0
.dvc/config

@@ -0,0 +1,4 @@
+[core]
+    remote = s3remote
+['remote "s3remote"']
+    url = s3://dagshub-covid19-gmr-public

+ 3
- 3
preprocess.dvc

@@ -1,4 +1,4 @@
-md5: f071416f134f289e486986d12ab29390
+md5: 022a56239fda37856a8f555b46ca01e7
 cmd: Rscript scripts/preprocess.R
 deps:
 - md5: 6c379663426e8135df92af20075e33b4
@@ -7,10 +7,10 @@ deps:
   path: data/raw/Global_Mobility_Report.csv
 - md5: ac537d89c981b02641203d606123c78a
   path: data/raw/UN_dataset.tsv
-- md5: a82b39358ca5f7a37941b60a604c745c
+- md5: c0c64a88cb4311be38196a365e927b44
   path: scripts/preprocess.R
 outs:
-- md5: bcc995433b8502d4c0c1dd761f313786
+- md5: c97cc30cf2e11c445e64a5bfc4a146ee
   path: data/preprocessed/DIB_dataset.tsv
   cache: true
   metric: false

+ 52
- 9
scripts/preprocess.R

@@ -32,9 +32,6 @@ colnames(covid) <- c('date', 'day', 'month', 'year', 'new_cases', 'new_deaths',
                      'pop_data_2018')
 
 # Country details from UN Data
-# There were ~ in 0 or 0.0 numbers in the raw data file. I had to manually
-# replace ~0 and ~0.0 by 0 and 0.0, otherwise R wouldn't understand this is
-# a number.
 country_details <- read_delim(file = 'data/raw/UN_dataset.tsv', delim = '\t',
                             col_types = paste(c('c',
                                                 rep('d', 173),
@@ -134,11 +131,11 @@ preprocessed_dataset %>%
 
 # Before merging to get more info about the countries, we must make sure all
 # country names are the same.
-# unique(preprocessed_dataset$country_name)[which(
-#      unique(preprocessed_dataset$country_name) %in%
-#        unique(country_details$country) == FALSE
-#    )
-#  ]
+#unique(preprocessed_dataset$country_name)[which(
+#     unique(preprocessed_dataset$country_name) %in%
+#       unique(country_details$region_name) == FALSE
+#   )
+# ]
 
 country_details %>%
   mutate(region_name = case_when(
@@ -157,6 +154,26 @@ country_details %>%
     TRUE ~ region_name)
   ) -> country_details
 
+# In some datasets US appears as United States, and in others as United States
+# of America. The naming was fixed earlier, but we have two rows for US. Fix.
+# ids <- which(country_details$region_name == 'United States')
+
+country_details[88,][,17:42] <- country_details[217,][,17:42]
+country_details[88,][,45:51] <- country_details[217,][,45:51]
+country_details[88,][,53:55] <- country_details[217,][,53:55]
+country_details[88,][,63:72] <- country_details[217,][,63:72]
+country_details[88,][,77:80] <- country_details[217,][,77:80]
+country_details[88,][,90:93] <- country_details[217,][,90:93]
+country_details[88,][,97:104] <- country_details[217,][,97:104]
+country_details[88,][,112:114] <- country_details[217,][,112:114]
+country_details[88,][,116:118] <- country_details[217,][,116:118]
+country_details[88,][,128:133] <- country_details[217,][,128:133]
+country_details[88,][,143:148] <- country_details[217,][,143:148]
+country_details[88,][,150] <- country_details[217,][,150]
+country_details[88,][,152:157] <- country_details[217,][,152:157]
+country_details[88,][,159:175] <- country_details[217,][,159:175]
+country_details <- country_details[-217, ]
+
 ####
 #
 # Merge country details and preprocessed_dataset
@@ -198,7 +215,7 @@ preprocessed_dataset %>%
 preprocessed_dataset %>%
   pivot_wider(names_from = plot_name, values_from = variation) -> preprocessed_dataset
 
-# Add epidemiological week to column
+# Add n_days_since_1st_case column
 preprocessed_dataset %>%
   group_by(country_name) %>%
   mutate(first_case_date = min(date[acc_cases > 0])) %>%
@@ -208,6 +225,16 @@ preprocessed_dataset %>%
                    0)) %>%
   ungroup() -> preprocessed_dataset
 
+# Add n_days_since_1st_death column
+preprocessed_dataset %>%
+  group_by(country_name) %>%
+  mutate(first_death_date = min(date[acc_deaths > 0])) %>%
+  mutate(n_days_since_1st_death =
+           if_else(acc_deaths > 0,
+                   as.numeric(date - min(date[acc_deaths > 0])+1),
+                   0)) %>%
+  ungroup() -> preprocessed_dataset
+
 # Manually set first case for countries whose first case happened before Feb 15
 # Wikipedia contributors, "2019–20 coronavirus pandemic", Wikipedia, The Free
 # Encyclopedia,
@@ -246,6 +273,22 @@ preprocessed_dataset %>%
     )
   ) -> preprocessed_dataset
 
+# Fix n_days since 1st case for countries that had 1st case before Feb 11
+countries <- c('Thailand', 'Japan', 'South Korea', 'United States', 'Taiwan',
+               'Hong Kong', 'Singapore', 'Vietnam', 'France', 'Nepal',
+               'Australia', 'Canada', 'Malaysia', 'Cambodia', 'Germany',
+               'Sri Lanka', 'Finland', 'United Arab Emirates', 'India',
+               'Italy', 'Philippines', 'Spain', 'Sweden', 'United Kingdom',
+               'Belgium', 'Egypt')
+preprocessed_dataset %>%
+  group_by(country_name) %>%
+  mutate(n_days_since_1st_case =
+           if_else(country_name %in% countries,
+                   as.numeric(date - first_case_date)+1,
+                   n_days_since_1st_case)) %>%
+  ungroup() -> preprocessed_dataset
+rm(countries)
+
 # Saving final preprocessed dataset ---------------------------------------
 
 # Save full dataset