clean.py

  1. """
  2. Normalize curtailment data form the CAISO.
  3. The goal of this module is to produce a set of year/month.
  4. Inputs:
  5. xlsx:sheet_name="Production"
  6. Date datetime64[ns]
  7. Hour int64
  8. Interval int64
  9. Load float64
  10. Solar float64
  11. Wind float64
  12. Net Load float64
  13. Renewables float64
  14. Nuclear float64
  15. Large Hydro float64
  16. Imports float64
  17. Generation float64
  18. Thermal float64
  19. Load Less (Generation+Imports) float64
  20. xlsx:sheet_name="Curtailments"
  21. Date datetime64[ns]
  22. Hour int64
  23. Interval int64
  24. Wind Curtailment float64
  25. Solar Curtailment float64
  26. Outputs:
  27. A parquet dataset with grain of 5-min resolution datetimes.
  28. parquet
  29. Load float64
  30. Solar float64
  31. Wind float64
  32. Net Load float64
  33. Renewables float64
  34. Nuclear float64
  35. Large Hydro float64
  36. Imports float64
  37. Generation float64
  38. Thermal float64
  39. Load Less (Generation+Imports) float64
  40. Wind Curtailment float64
  41. Solar Curtailment float64
  42. """
import warnings
from pathlib import Path

import pandas as pd
from loguru import logger

from src.conf import settings

# Ignore pandas warnings about str.contains - we are using it as a mask!
warnings.filterwarnings("ignore", "This pattern has match groups")

def parse(fp):
    """
    Pipeline:

    1. Align each row of the Production and Curtailments tables to a single timeseries
    2. Merge both tables into a single table
    3. Write out a UTC-aware, timestamp-indexed file with all MW columns
    """
    xl_file = pd.ExcelFile(fp)
    production = xl_file.parse("Production")
    curtailments = xl_file.parse("Curtailments")

    # Parse Production Timestamps -
    # "Date" is a naive timestamp in US/Pacific.
    # Ambiguous timestamps should be inferred since the data are ordered and hopefully complete.
    production["timestamp"] = production["Date"].dt.tz_localize(
        tz="US/Pacific", ambiguous="infer"
    )
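    # (On the November fall-back day each naive 01:xx reading occurs twice;
    # ambiguous="infer" relies on the rows being in chronological order to
    # assign the first pass to PDT and the second to PST.)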

    # Parse Curtailments Timestamps -
    # "Date" is a naive date with no time info, so build the time of day
    # from the 1-indexed "Hour" and "Interval" columns.
    curtailments["timestamp"] = (
        curtailments["Date"]
        + pd.to_timedelta(  # Add the Curtailment Hour; 1-indexed => 1 == hour starting at midnight
            curtailments["Hour"] - 1, unit="H"
        )
        + pd.to_timedelta(  # Add the Curtailment Interval; 5-min, and 1-indexed => 1 == start of the hour
            (curtailments["Interval"] - 1) * 5, unit="min"
        )
    ).dt.tz_localize(tz="US/Pacific", ambiguous=True)
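    # Worked example (dates illustrative): Date=2020-06-01, Hour=2, Interval=3
    #   => midnight + (2-1) hours + (3-1)*5 minutes
    #   => 2020-06-01 01:10:00-07:00 (US/Pacific)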

    data = production.set_index("timestamp", verify_integrity=True).merge(
        curtailments.set_index("timestamp", verify_integrity=True),
        left_index=True,
        right_index=True,
        how="left",
    )

    # Check that merging data left us with the correct shape
    assert data.shape[0] == production.shape[0]
    assert (
        data.shape[1] == production.shape[1] + curtailments.shape[1] - 2
    )  # subtract 2 for idx cols

    # Note that we expect to lose some curtailments data because the production data is not complete!
    try:
        assert (
            curtailments["Solar Curtailment"].notnull().sum()
            == data["Solar Curtailment"].notnull().sum()
        )
    except AssertionError:
        logger.warning(
            "Production data and curtailment data failed to line up. "
            "The CAISO data may be missing data. [{timestamps}]",
            timestamps=curtailments.set_index("timestamp", verify_integrity=True)
            .index.difference(data.index)
            .tolist(),
        )
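
    # The index merge suffixes the overlapping raw columns (Date, Hour,
    # Interval) with _x/_y; they carry no information beyond the timestamp
    # index, so drop them.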
    cols_to_drop = data.columns[data.columns.str.contains("_(x|y)$")].tolist()
    logger.debug("Dropping columns from CAISO data: {cols}", cols=cols_to_drop)
    data.drop(columns=cols_to_drop, inplace=True)
    return data

def main(output_dir=settings.DATA_DIR / "processed/caiso"):
    """Parse each CAISO file, and output a single timeseries per year."""
    # parents=True so a fresh checkout can create processed/caiso in one go
    output_dir.mkdir(parents=True, exist_ok=True)
    for fp in settings.DATA_DIR.glob("raw/caiso/*xlsx"):
        logger.info("Parsing raw caiso file: {fp}", fp=fp)
        data = parse(fp)
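        # pd.Grouper(level=0, freq="Y") buckets the timestamp index by calendar
        # year, so each iteration yields one year's worth of 5-minute rows.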
        for partition, df in data.groupby(pd.Grouper(level=0, freq="Y")):
            name = f"{partition.year}.parquet"
            output_fp = output_dir / name
            # If the file already exists, assume we should be appending to it
            append = output_fp.exists()
            logger.info(
                "Writing (append={append}) {year} data from {fp}",
                append=append,
                year=partition.year,
                fp=fp,
            )
            # Write in UTC to avoid ambiguous DST timestamps!
            df.tz_convert("UTC").to_parquet(
                output_fp, engine="fastparquet", append=append
            )

if __name__ == "__main__":
    # Clean Data
    main()
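
# Example of consuming the output (illustrative; assumes the default
# output_dir above and that a 2020 partition exists on disk):
#
#     df = pd.read_parquet(settings.DATA_DIR / "processed/caiso/2020.parquet")
#     df = df.tz_convert("US/Pacific")  # back to local time for analysis
#     daily_mwh = df["Solar Curtailment"].resample("D").sum() / 12  # 5-min MW -> approx MWh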