mrd
/
DIB_COVID19_paper
mirror of https://github.com/mribeirodantas/DIB_COVID19_paper


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
102

	
103

	
104

	
105

	
106

	
107

	
108

	
109

	
110

	
111

	
112

	
113

	
114

	
115

	
116

	
117

	
118

	
119

	
120

	
121

	
122

	
123

	
124

	
            library(readr)
library(dplyr)
library(tidyr)
library(WriteXLS)

# Read all raw datasets from UN into one dataframe ---------

df <- list.files(path='data/raw/UN Data/', full.names = TRUE) %>%
  lapply(read_delim, delim = ',', skip = 1) %>%
  bind_rows

# Keep the variables required to create the data dictionary

df %>%
  select(Series, Year, Source) -> df

# Keep only latest data for each variable
df %>%
  group_by(Series) %>%
  filter(Year == max(Year)) %>%
  ungroup() -> df

# Variables are repeated for they occur for every country. Let's remove the
# duplicates.
df %>%
  distinct(Series, .keep_all = TRUE) -> df

# Fix names according to preprocessed dataset
df %>%
  # Make them all lowercase
  mutate(Series = tolower(Series)) %>%
  # Replace special chars
  mutate(Series = gsub(' ', '_', Series)) %>%
  mutate(Series = gsub('-', '_', Series)) %>%
  mutate(Series = gsub('_+', '_', Series)) %>%
  mutate(Series = gsub(',', '', Series)) -> df

# Add _year to the end of variable name, just like in the preprocessed file
df %>%
  unite("Series", Series:Year, remove=FALSE) -> df

# Add engineering UN variables
df <- rbind(df, c('whos_major_trade_partner_exp_1', 2018,
                  paste0('United Nations Statistics Division, New York, ',
                         'Commodity Trade Statistics Database (UN COMTRADE), ',
                         'last accessed May 2019.')))

# Add COVID-19 and engineering columns
df <- rbind(df,
            c('country_code', '2020', paste0('European Centre for Disease ',
                                             'Prevention and Control. Last ',
                                             'accessed 28 April, 2020.')),
            c('country_name', '2020', paste0('European Centre for Disease ',
                                             'Prevention and Control. Last ',
                                             'accessed 28 April, 2020.')),
            c('date', '2020', paste0('European Centre for Disease ',
                                     'Prevention and Control. Last ',
                                     'accessed 28 April, 2020.')),
            c('new_cases', '2020', paste0('European Centre for Disease ',
                                          'Prevention and Control. Last ',
                                          'accessed 28 April, 2020.')),
            c('new_deaths', '2020',  paste0('European Centre for Disease ',
                                            'Prevention and Control. Last ',
                                            'accessed 28 April, 2020.')),
            c('pop_data_2018', '2018', paste0('European Centre for Disease ',
                                              'Prevention and Control ',
                                              'collected from World Bank. ',
                                              'Last accessed 28 April, 2020.')),
            c('acc_cases', '2020', paste0('Engineered based on data from ',
                                          'European Centre for Disease ',
                                          'Prevention and Control. Last ',
                                          'accessed 28 April, 2020.')),
            c('acc_deaths', '2020', paste0('Engineered based on data from ',
                                           'European Centre for Disease ',
                                           'Prevention and Control. Last ',
                                           'accessed 28 April, 2020.')),
            c('lethality_rate_percent', '2020', paste0('Engineered based on data from ',
                                                       'European Centre for Disease ',
                                                       'Prevention and Control. Last ',
                                                       'accessed April, 2020.')),
            c('retail_recreation', '2020', paste0('Google Community Mobility ',
                                                  'Report. Last accessed 28 ',
                                                  'April, 2020')),
            c('grocery_pharmacy', '2020', paste0('Google Community Mobility ',
                                                 'Report. Last accessed 28 ',
                                                 'April, 2020')),
            c('parks', '2020', paste0('Google Community Mobility ',
                                      'Report. Last accessed 28 ',
                                      'April, 2020')),
            c('transit_stations', '2020', paste0('Google Community Mobility ',
                                                 'Report. Last accessed 28 ',
                                                 'April, 2020')),
            c('workplaces', '2020', paste0('Google Community Mobility ',
                                           'Report. Last accessed 28 ',
                                           'April, 2020')),
            c('residential', '2020', paste0('Google Community Mobility ',
                                            'Report. Last accessed 28 ',
                                            'April, 2020')),
            c('first_case_date', '2020',
              paste0('https://en.wikipedia.org/w/index.php?title=2019%E2%80%93',
                     '20_coronavirus_pandemic_by_country_and_territory&oldid=9',
                     '53662872 Last accessed April 28, 2020')),
            c('n_days_since_1st_case', '2020',
              paste0('Engineering from ECDC and https://en.wikipedia.org/w/index.php?title=2019%E2%80%93',
                     '20_coronavirus_pandemic_by_country_and_territory&oldid=9',
                     '53662872 Last accessed April 28, 2020')),
            c('first_death_date', '2020', paste0('Engineering based on ',
                                                 'data from ECDC ',
                                                 'counting the first death ',
                                                 'after February, 15th.')),
            c('n_days_since_1st_death', '2020', paste0('Engineering based on ',
                                                       'data from ECDC ',
                                                       'counting the first death ',
                                                       'after February, 15th.'))
            )
colnames(df) <- c('Variable name', 'Year', 'Source')

# Add description for some variables
df$Description <- NULL


WriteXLS(x = df, ExcelFileName = 'data_dictionary.xls',
         SheetNames = 'Data Dictionary')