Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

generate_raw_dictionary_file.R 6.3 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
  1. library(readr)
  2. library(dplyr)
  3. library(tidyr)
  4. library(WriteXLS)
  5. # Read all raw datasets from UN into one dataframe ---------
  6. df <- list.files(path='data/raw/UN Data/', full.names = TRUE) %>%
  7. lapply(read_delim, delim = ',', skip = 1) %>%
  8. bind_rows
  9. # Keep the variables required to create the data dictionary
  10. df %>%
  11. select(Series, Year, Source) -> df
  12. # Keep only latest data for each variable
  13. df %>%
  14. group_by(Series) %>%
  15. filter(Year == max(Year)) %>%
  16. ungroup() -> df
  17. # Variables are repeated for they occur for every country. Let's remove the
  18. # duplicates.
  19. df %>%
  20. distinct(Series, .keep_all = TRUE) -> df
  21. # Fix names according to preprocessed dataset
  22. df %>%
  23. # Make them all lowercase
  24. mutate(Series = tolower(Series)) %>%
  25. # Replace special chars
  26. mutate(Series = gsub(' ', '_', Series)) %>%
  27. mutate(Series = gsub('-', '_', Series)) %>%
  28. mutate(Series = gsub('_+', '_', Series)) %>%
  29. mutate(Series = gsub(',', '', Series)) -> df
  30. # Add _year to the end of variable name, just like in the preprocessed file
  31. df %>%
  32. unite("Series", Series:Year, remove=FALSE) -> df
  33. # Add engineering UN variables
  34. df <- rbind(df, c('whos_major_trade_partner_exp_1', 2018,
  35. paste0('United Nations Statistics Division, New York, ',
  36. 'Commodity Trade Statistics Database (UN COMTRADE), ',
  37. 'last accessed May 2019.')))
  38. # Add COVID-19 and engineering columns
  39. df <- rbind(df,
  40. c('country_code', '2020', paste0('European Centre for Disease ',
  41. 'Prevention and Control. Last ',
  42. 'accessed 28 April, 2020.')),
  43. c('country_name', '2020', paste0('European Centre for Disease ',
  44. 'Prevention and Control. Last ',
  45. 'accessed 28 April, 2020.')),
  46. c('date', '2020', paste0('European Centre for Disease ',
  47. 'Prevention and Control. Last ',
  48. 'accessed 28 April, 2020.')),
  49. c('new_cases', '2020', paste0('European Centre for Disease ',
  50. 'Prevention and Control. Last ',
  51. 'accessed 28 April, 2020.')),
  52. c('new_deaths', '2020', paste0('European Centre for Disease ',
  53. 'Prevention and Control. Last ',
  54. 'accessed 28 April, 2020.')),
  55. c('pop_data_2018', '2018', paste0('European Centre for Disease ',
  56. 'Prevention and Control ',
  57. 'collected from World Bank. ',
  58. 'Last accessed 28 April, 2020.')),
  59. c('acc_cases', '2020', paste0('Engineered based on data from ',
  60. 'European Centre for Disease ',
  61. 'Prevention and Control. Last ',
  62. 'accessed 28 April, 2020.')),
  63. c('acc_deaths', '2020', paste0('Engineered based on data from ',
  64. 'European Centre for Disease ',
  65. 'Prevention and Control. Last ',
  66. 'accessed 28 April, 2020.')),
  67. c('lethality_rate_percent', '2020', paste0('Engineered based on data from ',
  68. 'European Centre for Disease ',
  69. 'Prevention and Control. Last ',
  70. 'accessed April, 2020.')),
  71. c('retail_recreation', '2020', paste0('Google Community Mobility ',
  72. 'Report. Last accessed 28 ',
  73. 'April, 2020')),
  74. c('grocery_pharmacy', '2020', paste0('Google Community Mobility ',
  75. 'Report. Last accessed 28 ',
  76. 'April, 2020')),
  77. c('parks', '2020', paste0('Google Community Mobility ',
  78. 'Report. Last accessed 28 ',
  79. 'April, 2020')),
  80. c('transit_stations', '2020', paste0('Google Community Mobility ',
  81. 'Report. Last accessed 28 ',
  82. 'April, 2020')),
  83. c('workplaces', '2020', paste0('Google Community Mobility ',
  84. 'Report. Last accessed 28 ',
  85. 'April, 2020')),
  86. c('residential', '2020', paste0('Google Community Mobility ',
  87. 'Report. Last accessed 28 ',
  88. 'April, 2020')),
  89. c('first_case_date', '2020',
  90. paste0('https://en.wikipedia.org/w/index.php?title=2019%E2%80%93',
  91. '20_coronavirus_pandemic_by_country_and_territory&oldid=9',
  92. '53662872 Last accessed April 28, 2020')),
  93. c('n_days_since_1st_case', '2020',
  94. paste0('Engineering from ECDC and https://en.wikipedia.org/w/index.php?title=2019%E2%80%93',
  95. '20_coronavirus_pandemic_by_country_and_territory&oldid=9',
  96. '53662872 Last accessed April 28, 2020')),
  97. c('first_death_date', '2020', paste0('Engineering based on ',
  98. 'data from ECDC ',
  99. 'counting the first death ',
  100. 'after February, 15th.')),
  101. c('n_days_since_1st_death', '2020', paste0('Engineering based on ',
  102. 'data from ECDC ',
  103. 'counting the first death ',
  104. 'after February, 15th.'))
  105. )
  106. colnames(df) <- c('Variable name', 'Year', 'Source')
  107. # Add description for some variables
  108. df$Description <- NULL
  109. WriteXLS(x = df, ExcelFileName = 'data_dictionary.xls',
  110. SheetNames = 'Data Dictionary')
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...