Browse Source

Updates raw datasets 26 Aug 2020

Marcel Ribeiro-Dantas 2 months ago
parent
commit
2ddb4736ce
40 changed files with 46 additions and 183 deletions
  1. 0
    4
      data/raw/UN Data/SYB58_35_Index of industrial production.csv.dvc
  2. 0
    4
      data/raw/UN Data/SYB60_312_Carbon Dioxide Emission Estimates.csv.dvc
  3. 0
    4
      data/raw/UN Data/SYB61_12_Agricultural Production Indices.csv.dvc
  4. 0
    4
      data/raw/UN Data/SYB61_253_Population Growth Rates in Urban areas and Capital cities.csv.dvc
  5. 0
    4
      data/raw/UN Data/SYB62_123_201907_Total Imports, Exports and Balance of Trade.csv.dvc
  6. 0
    4
      data/raw/UN Data/SYB62_125_201907_Balance of Payments.csv.dvc
  7. 0
    4
      data/raw/UN Data/SYB62_128_201907_Consumer Price Index.csv.dvc
  8. 0
    4
      data/raw/UN Data/SYB62_130_201907_Exchange Rates.csv.dvc
  9. 0
    4
      data/raw/UN Data/SYB62_145_201904_Land.csv.dvc
  10. 0
    4
      data/raw/UN Data/SYB62_153_201906_Gross Value Added by Economic Activity.csv.dvc
  11. 0
    4
      data/raw/UN Data/SYB62_154_201906_Health Personnel.csv.dvc
  12. 0
    4
      data/raw/UN Data/SYB62_1_201907_Population, Surface Area and Density.csv.dvc
  13. 0
    4
      data/raw/UN Data/SYB62_200_201905_Employment.csv.dvc
  14. 0
    4
      data/raw/UN Data/SYB62_223_201907_Net Disbursements from Official ODA from Donors.csv.dvc
  15. 0
    4
      data/raw/UN Data/SYB62_226_201907_Net Disbursements from Official ODA to Recipients.csv.dvc
  16. 0
    4
      data/raw/UN Data/SYB62_230_201904_GDP and GDP Per Capita.csv.dvc
  17. 0
    4
      data/raw/UN Data/SYB62_245_201905_Public Expenditure on Education.csv.dvc
  18. 0
    4
      data/raw/UN Data/SYB62_246_201907_Population Growth, Fertility and Mortality Indicators.csv.dvc
  19. 0
    4
      data/raw/UN Data/SYB62_263_201904_Production, Trade and Supply of Energy.csv.dvc
  20. 0
    4
      data/raw/UN Data/SYB62_264_201902_Patents.csv.dvc
  21. 0
    4
      data/raw/UN Data/SYB62_285_201904_Research and Development Staff.csv.dvc
  22. 0
    4
      data/raw/UN Data/SYB62_286_201904_GDP on R&D.csv.dvc
  23. 0
    4
      data/raw/UN Data/SYB62_309_201906_Education.csv.dvc
  24. 0
    4
      data/raw/UN Data/SYB62_313_201907_Threatened Species.csv.dvc
  25. 0
    4
      data/raw/UN Data/SYB62_314_201904_Internet Usage.csv.dvc
  26. 0
    4
      data/raw/UN Data/SYB62_315_201906_Water and Sanitation Services.csv.dvc
  27. 0
    4
      data/raw/UN Data/SYB62_317_201905_Seats held by Women in Parliament.csv.dvc
  28. 0
    4
      data/raw/UN Data/SYB62_319_201906_Ratio of Girls to Boys in Education.csv.dvc
  29. 0
    4
      data/raw/UN Data/SYB62_323_201906_Teaching Staff in Education.csv.dvc
  30. 0
    4
      data/raw/UN Data/SYB62_327_201907_International Migrants and Refugees.csv.dvc
  31. 0
    4
      data/raw/UN Data/SYB62_328_201904_Intentional Homicides and Other Crimes.csv.dvc
  32. 0
    4
      data/raw/UN Data/SYB62_329_201904_Labour Force and Unemployment.csv.dvc
  33. 0
    4
      data/raw/UN Data/SYB62_330_201907_Major Trading Partners.csv.dvc
  34. 0
    4
      data/raw/UN Data/SYB63_176_202003_Tourist-Visitors Arrival and Expenditure.csv.dvc
  35. 0
    4
      data/raw/UN Data/SYB63_325_202003_Expenditure on Health.csv.dvc
  36. 1
    4
      generate_single_UN_dataset.dvc
  37. 6
    9
      preprocess.dvc
  38. 13
    12
      scripts/check_update.R
  39. 21
    7
      scripts/preprocess.R
  40. 5
    11
      update_datasets.dvc

+ 0
- 4
data/raw/UN Data/SYB58_35_Index of industrial production.csv.dvc

@@ -1,7 +1,3 @@
-md5: 79cbc034b8b28faa09bbfccc16e71c41
 outs:
 - md5: 9c9d964d9036ceea96f864cd655e7e41
   path: SYB58_35_Index of industrial production.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB60_312_Carbon Dioxide Emission Estimates.csv.dvc

@@ -1,7 +1,3 @@
-md5: 715dba65d5677d7f4686ab7e4584befc
 outs:
 - md5: ba4b5812da8a9700308bc21924225e50
   path: SYB60_312_Carbon Dioxide Emission Estimates.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB61_12_Agricultural Production Indices.csv.dvc

@@ -1,7 +1,3 @@
-md5: a3bf3e277bff12f3615c0a13d363bc98
 outs:
 - md5: ee78b455c64667c8406810602877bfbe
   path: SYB61_12_Agricultural Production Indices.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB61_253_Population Growth Rates in Urban areas and Capital cities.csv.dvc

@@ -1,7 +1,3 @@
-md5: 7aa984393f5c3bb226805cb9653f64af
 outs:
 - md5: 62a066fa45d106eccdf36e236298c028
   path: SYB61_253_Population Growth Rates in Urban areas and Capital cities.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_123_201907_Total Imports, Exports and Balance of Trade.csv.dvc

@@ -1,7 +1,3 @@
-md5: 30839962a05d9a0da5099d86ee28b249
 outs:
 - md5: 02c9b22534096171365618e7543220bb
   path: SYB62_123_201907_Total Imports, Exports and Balance of Trade.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_125_201907_Balance of Payments.csv.dvc

@@ -1,7 +1,3 @@
-md5: dbae073c33bc309bf72f53371841869a
 outs:
 - md5: e12af62d34294ea0361858f6d54270ff
   path: SYB62_125_201907_Balance of Payments.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_128_201907_Consumer Price Index.csv.dvc

@@ -1,7 +1,3 @@
-md5: 5ba5b2ba7645a3e684059231d6d69243
 outs:
 - md5: 5b8d619ec7fe1bd845a74a0fb99745ef
   path: SYB62_128_201907_Consumer Price Index.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_130_201907_Exchange Rates.csv.dvc

@@ -1,7 +1,3 @@
-md5: a0cb1d87cf1f744e6e3a5c55bfb0c1ea
 outs:
 - md5: cfc1991fad2187db027cafe59d113d7a
   path: SYB62_130_201907_Exchange Rates.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_145_201904_Land.csv.dvc

@@ -1,7 +1,3 @@
-md5: 5d492a619cbb334059e9f3c076f4dd48
 outs:
 - md5: e8df602fccc6523fa7d3256c61d5914e
   path: SYB62_145_201904_Land.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_153_201906_Gross Value Added by Economic Activity.csv.dvc

@@ -1,7 +1,3 @@
-md5: b64f1e229bc3a77bb956e39ae3c9fd44
 outs:
 - md5: 5874fff5667658c8e322a7ab6ca1fcf0
   path: SYB62_153_201906_Gross Value Added by Economic Activity.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_154_201906_Health Personnel.csv.dvc

@@ -1,7 +1,3 @@
-md5: 4301e366bc67a007bcac8f1423839dc0
 outs:
 - md5: f200bbacb1708bb6e7d61865af7e5e0c
   path: SYB62_154_201906_Health Personnel.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_1_201907_Population, Surface Area and Density.csv.dvc

@@ -1,7 +1,3 @@
-md5: e440460224e8ecb7a5326ce40f4a7f3b
 outs:
 - md5: a730e93da1ac779277f4530ddf948738
   path: SYB62_1_201907_Population, Surface Area and Density.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_200_201905_Employment.csv.dvc

@@ -1,7 +1,3 @@
-md5: f26190c5f15db1154a3bfa26fdc6128d
 outs:
 - md5: 5d0275fa6ce8ee83f5083ab190653821
   path: SYB62_200_201905_Employment.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_223_201907_Net Disbursements from Official ODA from Donors.csv.dvc

@@ -1,7 +1,3 @@
-md5: d7765ca1da9c39123e31671369d6bdc3
 outs:
 - md5: f7868cc3e29e34be8d23741663b314b8
   path: SYB62_223_201907_Net Disbursements from Official ODA from Donors.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_226_201907_Net Disbursements from Official ODA to Recipients.csv.dvc

@@ -1,7 +1,3 @@
-md5: 85d5609485eacfd46aedc6b1d1dc717a
 outs:
 - md5: 11517ac7e5e28383062b319f7f207df0
   path: SYB62_226_201907_Net Disbursements from Official ODA to Recipients.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_230_201904_GDP and GDP Per Capita.csv.dvc

@@ -1,7 +1,3 @@
-md5: 3b1054aaad3eb8c9b400bba5de95350f
 outs:
 - md5: 48da8b4d014f49770a81534df5f1d9cc
   path: SYB62_230_201904_GDP and GDP Per Capita.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_245_201905_Public Expenditure on Education.csv.dvc

@@ -1,7 +1,3 @@
-md5: 02456d415467ca90b628627be45256e3
 outs:
 - md5: 41e6d2f1e0c44d5479e19164171a3da0
   path: SYB62_245_201905_Public Expenditure on Education.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_246_201907_Population Growth, Fertility and Mortality Indicators.csv.dvc

@@ -1,7 +1,3 @@
-md5: 85eb5966c5c9b87a916c69fd993433dd
 outs:
 - md5: b9176a587c18fbc9084c67d4bc440bdb
   path: SYB62_246_201907_Population Growth, Fertility and Mortality Indicators.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_263_201904_Production, Trade and Supply of Energy.csv.dvc

@@ -1,7 +1,3 @@
-md5: f8ad1b32aa5889835a3bdd1650257bea
 outs:
 - md5: 3f5b40e0c5c52f83fbfcca51013ddb1f
   path: SYB62_263_201904_Production, Trade and Supply of Energy.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_264_201902_Patents.csv.dvc

@@ -1,7 +1,3 @@
-md5: 16fbdb8d0b3a92d7a57c152714006c79
 outs:
 - md5: e37b38fe8e1c514c90b87c5dec25c81a
   path: SYB62_264_201902_Patents.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_285_201904_Research and Development Staff.csv.dvc

@@ -1,7 +1,3 @@
-md5: 7e30256ca248fe94c7d1e0597fa36fe0
 outs:
 - md5: d1e747b2340b9aa65404a0062024be87
   path: SYB62_285_201904_Research and Development Staff.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_286_201904_GDP on R&D.csv.dvc

@@ -1,7 +1,3 @@
-md5: 0155b63f043fd3b6fa8ffcea8f34820d
 outs:
 - md5: 4298efc9d34e2031ab78355ecf332d2e
   path: SYB62_286_201904_GDP on R&D.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_309_201906_Education.csv.dvc

@@ -1,7 +1,3 @@
-md5: 7f551ae64fb70ebd215da7f719f9ef51
 outs:
 - md5: 99e2f88a12e7f56ce0ecbabbf72e21c2
   path: SYB62_309_201906_Education.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_313_201907_Threatened Species.csv.dvc

@@ -1,7 +1,3 @@
-md5: b5b9e0058550cddfbfd7d2cc2ff4e087
 outs:
 - md5: fd75fefd718fc9e2a1d80bb5d84fcde4
   path: SYB62_313_201907_Threatened Species.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_314_201904_Internet Usage.csv.dvc

@@ -1,7 +1,3 @@
-md5: d699f0fa75292458ebacc281a7090ca9
 outs:
 - md5: 810a4fdcc6150600723f76ffed960720
   path: SYB62_314_201904_Internet Usage.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_315_201906_Water and Sanitation Services.csv.dvc

@@ -1,7 +1,3 @@
-md5: e9317ba0bd9a0f120d9184ca51f8d431
 outs:
 - md5: c5958fbe4bb6f9dee31ce3f0aa9dcf04
   path: SYB62_315_201906_Water and Sanitation Services.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_317_201905_Seats held by Women in Parliament.csv.dvc

@@ -1,7 +1,3 @@
-md5: 7d31cbc62c9aad6da8728302f6af05e2
 outs:
 - md5: 62ea227bc1cad2f03dbe725d03cd0c3a
   path: SYB62_317_201905_Seats held by Women in Parliament.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_319_201906_Ratio of Girls to Boys in Education.csv.dvc

@@ -1,7 +1,3 @@
-md5: ae071e60c72e1e173613ace508e114f3
 outs:
 - md5: 20a7e7b9b64b62d0707cc913ba144d47
   path: SYB62_319_201906_Ratio of Girls to Boys in Education.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_323_201906_Teaching Staff in Education.csv.dvc

@@ -1,7 +1,3 @@
-md5: add7b7a08fd46399f0713127f3f8798c
 outs:
 - md5: 2507d9006397678f2fc4f688127ed226
   path: SYB62_323_201906_Teaching Staff in Education.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_327_201907_International Migrants and Refugees.csv.dvc

@@ -1,7 +1,3 @@
-md5: 7fb6a4222572939d3cc46ca3489abd4b
 outs:
 - md5: 49d3f49f984f61b31de4ed7dd114bd76
   path: SYB62_327_201907_International Migrants and Refugees.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_328_201904_Intentional Homicides and Other Crimes.csv.dvc

@@ -1,7 +1,3 @@
-md5: 03f4a8d098242db80f6662b62e2e66bc
 outs:
 - md5: 96c22545d7bd0e12a036018e1df7d3dd
   path: SYB62_328_201904_Intentional Homicides and Other Crimes.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_329_201904_Labour Force and Unemployment.csv.dvc

@@ -1,7 +1,3 @@
-md5: e02cc7c46d335cab858b992d926f30e9
 outs:
 - md5: 753393f93d3b6e5d3fe940a302770e07
   path: SYB62_329_201904_Labour Force and Unemployment.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB62_330_201907_Major Trading Partners.csv.dvc

@@ -1,7 +1,3 @@
-md5: fd419ace32adfa2a3a218f950fff92b7
 outs:
 - md5: fb55792a13cc53107831b747638e3395
   path: SYB62_330_201907_Major Trading Partners.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB63_176_202003_Tourist-Visitors Arrival and Expenditure.csv.dvc

@@ -1,7 +1,3 @@
-md5: cea977e626da224e7adb5fefad4490ec
 outs:
 - md5: 8f52e307d5ec77056bf7ca6d0755fe94
   path: SYB63_176_202003_Tourist-Visitors Arrival and Expenditure.csv
-  cache: true
-  metric: false
-  persist: false

+ 0
- 4
data/raw/UN Data/SYB63_325_202003_Expenditure on Health.csv.dvc

@@ -1,7 +1,3 @@
-md5: 2c664492cfa2323923625687685e7d77
 outs:
 - md5: 510656f91274af57cc3851215d342d2c
   path: SYB63_325_202003_Expenditure on Health.csv
-  cache: true
-  metric: false
-  persist: false

+ 1
- 4
generate_single_UN_dataset.dvc

@@ -1,4 +1,4 @@
-md5: 1ce4b5cd5bdd4d81b98594788530a329
+md5: 059db79a0bec6a89f692326b4a7184bf
 cmd: Rscript scripts/aggregate_UN_data.R
 deps:
 - md5: 9c9d964d9036ceea96f864cd655e7e41
@@ -79,6 +79,3 @@ deps:
 outs:
 - md5: ac537d89c981b02641203d606123c78a
   path: data/raw/UN_dataset.tsv
-  cache: true
-  metric: false
-  persist: false

+ 6
- 9
preprocess.dvc

@@ -1,19 +1,16 @@
-md5: 9735989952cd30a4a68ca035db313c5c
+md5: 6d3a26ca07f1c81fac7288b27ef4edd0
 cmd: Rscript scripts/preprocess.R
 deps:
-- md5: bde10be2c1f175675960a08d0c759368
+- md5: 47d4e2dd5a724d0a91d098ad628a11d6
   path: data/raw/COVID19_worldwide_raw.csv
-- md5: 277567cf32ccb37284189b354e22b455
+- md5: bf0d393e8adb1a89b3388d227c3ac43e
   path: data/raw/Global_Mobility_Report.csv
 - md5: ac537d89c981b02641203d606123c78a
   path: data/raw/UN_dataset.tsv
-- md5: e6f4cb1116cd333d6982535a612e91a8
+- md5: 5948b29125eaf5ea963609226d93160c
   path: data/raw/hk-reunion-covid-19.csv
-- md5: bb1d9c16b486c38d9d49cac06fcef525
+- md5: 048c63bd32f360556164a62a39deb040
   path: scripts/preprocess.R
 outs:
-- md5: 47cfb91a431f79c4a0e134fe81f95200
+- md5: 8c28ae4f7557233d20004dbc14434c63
   path: data/preprocessed/DIB_dataset.tsv
-  cache: true
-  metric: false
-  persist: false

+ 13
- 12
scripts/check_update.R

@@ -30,6 +30,7 @@ latest_GMR_dataset <- read_csv(file = GMR_url,
                                                      country_region= 'c',
                                                      sub_region_1 = 'c',
                                                      sub_region_2 = 'c',
+                                                     metro_area = 'c',
                                                      date = 'D',
                                                      retail_and_recreation_percent_change_from_baseline = 'd',
                                                      grocery_and_pharmacy_percent_change_from_baseline = 'd',
@@ -158,15 +159,15 @@ if (max(latest_JHU_dataset$date) > last_date) {
 
 # Update remote / git repo ------------------------------------------------
 
-if (repro == TRUE) {
-  # system('dvc repro preprocess.dvc')
-  system(paste0('git add data/raw/COVID19_worldwide_raw.csv.dvc preprocess.dvc',
-                ' data/raw/hk-reunion-covid-19.csv.dvc data/raw/Global_Mobilit',
-                'y_Report.csv.dvc'))
-  commit_msg <- paste0('Updates raw datasets \'', today(), '\'')
-  system(paste0('git commit -m \"', commit_msg, '\"'))
-  system('git push')
-} else {
-  print('Everything is up to date. Nothing else to do.')
-}
-
+#if (repro == TRUE) {
+#  system('dvc repro preprocess.dvc')
+#  system(paste0('git add data/raw/COVID19_worldwide_raw.csv.dvc preprocess.dvc',
+#                ' data/raw/hk-reunion-covid-19.csv.dvc data/raw/Global_Mobilit',
+#                'y_Report.csv.dvc'))
+#  commit_msg <- paste0('Updates raw datasets \'', today(), '\'')
+#  system(paste0('git commit -m \"', commit_msg, '\"'))
+#  system('git push')
+#} else {
+#  print('Everything is up to date. Nothing else to do.')
+#}
+##

+ 21
- 7
scripts/preprocess.R

@@ -10,26 +10,32 @@ library(lubridate)
 # Google Mobility Report (GMR) dataset
 
 raw_dataset <- read_csv(file = 'data/raw/Global_Mobility_Report.csv',
-                        col_types = paste(c(rep('c', 4),
+                        col_types = paste(c(rep('c', 7),
                                                 'D',
                                                 rep('d', 6)),
                                               collapse=''))
 
 colnames(raw_dataset) <- c('locality_code', 'locality_name', 'region_name',
-                           'county_name', 'date', 'retail_recreation',
+                           'county_name', 'metro_area', 'iso_3166_2',
+                           'census_fips', 'date', 'retail_recreation',
                            'grocery_pharmacy', 'parks', 'transit_stations',
                            'workplaces', 'residential')
+# Remove new columns
+raw_dataset <- raw_dataset %>%
+  select(-c('iso_3166_2', 'census_fips'))
 
 # COVID19 dataset from ECDC
 
 covid <- read_delim(file = 'data/raw/COVID19_worldwide_raw.csv', na = '',
                     col_types = cols('c', 'i', 'i', 'i', 'i', 'i', 'c', 'c',
-                                     'c', 'i', 'c'),
+                                     'c', 'i', 'c', 'c'),
                     delim = ',')
 
 colnames(covid) <- c('date', 'day', 'month', 'year', 'new_cases', 'new_deaths',
                      'locality_name', 'country_id', 'territory_id',
-                     'pop_data_2018', 'continent')
+                     'pop_data_2018', 'continent', 'cumulative')
+covid <- covid %>%
+  select(-c('cumulative'))
 
 # COVID-19 data for Réunion and Hong Kong that are missing in the ECDC dataset
 # Source: Johns Hopkins University https://github.com/CSSEGISandData/COVID-19
@@ -58,10 +64,15 @@ country_details <- read_delim(file = 'data/raw/UN_dataset.tsv', delim = '\t',
 ####
 
 # Create a long table from the wide original version
-preprocessed_dataset <- pivot_longer(raw_dataset, cols=6:11, names_to = 'plot_name',
+preprocessed_dataset <- pivot_longer(raw_dataset, cols=7:12, names_to = 'plot_name',
                          values_to = 'variation')
 rm(raw_dataset)
 
+# Remove rows on metropolitan area
+preprocessed_dataset <- preprocessed_dataset %>%
+  filter(is.na(metro_area)) %>%
+  select(-c(metro_area))
+
 ####
 #
 # Working on COVID19 dataset
@@ -224,7 +235,10 @@ preprocessed_dataset %>%
 
 # Bring plot_names from row to column
 preprocessed_dataset %>%
-  pivot_wider(names_from = plot_name, values_from = variation) -> preprocessed_dataset
+group_by(plot_name) %>%
+  mutate(row = row_number()) %>%
+  tidyr::pivot_wider(names_from = plot_name, values_from = variation) %>%
+  select(-row) -> preprocessed_dataset
 
 # Add n_days_since_1st_case column
 preprocessed_dataset %>%
@@ -332,7 +346,7 @@ id = which(colnames(preprocessed_dataset) == paste0('labour_force_participatio',
                                                     'n_male_2019'))
 colnames(preprocessed_dataset)[id] <- paste0('labour_force_participation_rate_',
                                              'male_2019')
-
+rm(id)
 
 # Saving final preprocessed dataset ---------------------------------------
 

+ 5
- 11
update_datasets.dvc

@@ -1,22 +1,16 @@
-md5: 9ae12bfc5bc4453767fb935c739d606d
+md5: 1daafa289f2b989c69e0f9375a666167
 cmd: Rscript scripts/check_update.R
 deps:
-- md5: 6523b7116c21580b99f40095a6685726
+- md5: 306f64fac5df431d8cd661b9d5b1890d
   path: scripts/check_update.R
 outs:
-- md5: 277567cf32ccb37284189b354e22b455
+- md5: bf0d393e8adb1a89b3388d227c3ac43e
   path: data/raw/Global_Mobility_Report.csv
-  cache: true
-  metric: false
   persist: true
-- md5: bde10be2c1f175675960a08d0c759368
+- md5: 47d4e2dd5a724d0a91d098ad628a11d6
   path: data/raw/COVID19_worldwide_raw.csv
-  cache: true
-  metric: false
   persist: true
-- md5: e6f4cb1116cd333d6982535a612e91a8
+- md5: 5948b29125eaf5ea963609226d93160c
   path: data/raw/hk-reunion-covid-19.csv
-  cache: true
-  metric: false
   persist: true
 always_changed: true