1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
|
- #!/usr/bin/env python3
- """
- Create rigorous, professional charts for AI bias evaluation article
- Removing sensationalistic language and ensuring data accuracy
- """
- import matplotlib.pyplot as plt
- import numpy as np
- def create_primary_findings_chart():
- """Create a clean chart showing the primary findings"""
- fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))
-
- # Left panel: Uncertainty acknowledgment change
- models = ['GPT-3.5-turbo\n(March 2023)', 'GPT-5-nano\n(August 2025)']
- unknown_rates = [13.1, 29.9]
- colors = ['#E63946', '#06D6A0']
-
- bars1 = ax1.bar(models, unknown_rates, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)
-
- # Add value labels
- for i, (bar, rate) in enumerate(zip(bars1, unknown_rates)):
- ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
- f'{rate}%', ha='center', va='bottom', fontweight='bold', fontsize=12)
-
- # Add difference annotation
- ax1.annotate('', xy=(1, 29.9), xytext=(0, 13.1),
- arrowprops=dict(arrowstyle='<->', color='blue', lw=2))
- ax1.text(0.5, 21.5, '+16.8\npercentage\npoints', ha='center', va='center',
- fontweight='bold', fontsize=11, color='blue',
- bbox=dict(boxstyle="round,pad=0.3", facecolor='lightblue', alpha=0.8))
-
- ax1.set_ylabel('Uncertainty Acknowledgment Rate (%)', fontsize=12, fontweight='bold')
- ax1.set_title('Ambiguous Context Behavior\n"Unknown" Selection Rate', fontsize=14, fontweight='bold')
- ax1.set_ylim(0, 35)
- ax1.grid(axis='y', alpha=0.3)
-
- # Right panel: Overall accuracy comparison
- overall_rates = [53.0, 61.2]
- bars2 = ax2.bar(models, overall_rates, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)
-
- for i, (bar, rate) in enumerate(zip(bars2, overall_rates)):
- ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
- f'{rate}%', ha='center', va='bottom', fontweight='bold', fontsize=12)
-
- ax2.annotate('', xy=(1, 61.2), xytext=(0, 53.0),
- arrowprops=dict(arrowstyle='<->', color='green', lw=2))
- ax2.text(0.5, 57.1, '+8.2\npercentage\npoints', ha='center', va='center',
- fontweight='bold', fontsize=11, color='green',
- bbox=dict(boxstyle="round,pad=0.3", facecolor='lightgreen', alpha=0.8))
-
- ax2.set_ylabel('Overall Accuracy Rate (%)', fontsize=12, fontweight='bold')
- ax2.set_title('Overall Performance\nCombined Accuracy', fontsize=14, fontweight='bold')
- ax2.set_ylim(45, 65)
- ax2.grid(axis='y', alpha=0.3)
-
- plt.suptitle('AI Bias Evaluation: Key Behavioral Changes\nGPT-3.5-turbo to GPT-5-nano (116,972 tests)',
- fontsize=16, fontweight='bold', y=0.98)
-
- plt.tight_layout()
- plt.savefig('/Users/mdangelo/projects/pf2/site/static/img/blog/bbq-bias/primary-findings-rigorous.png',
- dpi=300, bbox_inches='tight', facecolor='white')
- plt.close()
- def create_development_timeline_chart():
- """Create a clean timeline chart without sensationalistic framing"""
- fig, ax = plt.subplots(1, 1, figsize=(14, 8))
-
- # Data points
- models = ['GPT-3.5-turbo', 'GPT-5-nano']
- overall_accuracy = [53.0, 61.2]
- ambiguous_unknown = [13.1, 29.9]
-
- x_pos = [0, 1]
- width = 0.35
-
- # Create bars
- bars1 = ax.bar([p - width/2 for p in x_pos], overall_accuracy, width,
- label='Overall Accuracy', color='#2E86AB', alpha=0.8)
- bars2 = ax.bar([p + width/2 for p in x_pos], ambiguous_unknown, width,
- label='Uncertainty Acknowledgment\n(Ambiguous Contexts)', color='#A23B72', alpha=0.8)
-
- # Add value labels
- for bars in [bars1, bars2]:
- for bar in bars:
- height = bar.get_height()
- ax.text(bar.get_x() + bar.get_width()/2., height + 0.5,
- f'{height}%', ha='center', va='bottom', fontweight='bold')
-
- ax.set_ylabel('Performance Rate (%)', fontsize=12, fontweight='bold')
- ax.set_xlabel('Model (Release Date)', fontsize=12, fontweight='bold')
- ax.set_title('AI Bias Evolution: 2.5 Years of Development\nMarch 2023 to August 2025',
- fontsize=16, fontweight='bold')
-
- ax.set_xticks(x_pos)
- ax.set_xticklabels(['GPT-3.5-turbo\n(March 1, 2023)', 'GPT-5-nano\n(August 7, 2025)'])
- ax.legend(loc='upper left', fontsize=11)
- ax.grid(axis='y', alpha=0.3)
- ax.set_ylim(0, 70)
-
- # Add timeline context
- ax.text(0.5, 65, '2 years, 5 months of development', ha='center', va='center',
- fontsize=12, style='italic',
- bbox=dict(boxstyle="round,pad=0.5", facecolor='lightyellow', alpha=0.8))
-
- plt.tight_layout()
- plt.savefig('/Users/mdangelo/projects/pf2/site/static/img/blog/bbq-bias/development-timeline-rigorous.png',
- dpi=300, bbox_inches='tight', facecolor='white')
- plt.close()
- def create_methodology_accessibility_chart():
- """Create honest chart about methodology accessibility without inflated comparisons"""
- fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))
-
- # Left panel: Cost breakdown
- components = ['GPT-3.5-turbo\n(7.81M tokens)', 'GPT-5-nano\n(22.09M tokens)', 'Total Cost']
- costs = [3.96, 6.16, 10.13]
- colors = ['#E63946', '#06D6A0', '#2E86AB']
-
- bars = ax1.bar(components, costs, color=colors, alpha=0.8, edgecolor='black', linewidth=1)
-
- for bar, cost in zip(bars, costs):
- ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
- f'${cost}', ha='center', va='bottom', fontweight='bold', fontsize=12)
-
- ax1.set_ylabel('Cost (USD)', fontsize=12, fontweight='bold')
- ax1.set_title('Evaluation Cost Breakdown\n116,972 Total Tests', fontsize=14, fontweight='bold')
- ax1.set_ylim(0, 12)
- ax1.grid(axis='y', alpha=0.3)
-
- # Right panel: Evaluation scope
- metrics = ['Total Tests', 'Social Bias\nCategories', 'Models\nCompared', 'Cost per\n1000 Tests']
- values = [116972, 11, 2, 0.087] # $10.13 / 116.972 = $0.087 per 1000 tests
-
- # Use different scales for different metrics
- ax2_twin = ax2.twinx()
-
- # Plot total tests on left axis
- bar1 = ax2.bar('Total Tests', 116972, color='#2E86AB', alpha=0.8, width=0.6)
- ax2.text(0, 120000, '116,972', ha='center', va='bottom', fontweight='bold', fontsize=11)
-
- # Plot other metrics on right axis
- other_metrics = ['Social Bias\nCategories', 'Models\nCompared', 'Cost per\n1000 Tests']
- other_values = [11, 2, 0.087]
- x_pos = [1, 2, 3]
- bars2 = ax2_twin.bar(x_pos, other_values, color=['#A23B72', '#F18F01', '#06D6A0'],
- alpha=0.8, width=0.6)
-
- for i, (bar, val) in enumerate(zip(bars2, other_values)):
- if i == 2: # Cost per 1000 tests
- label = f'${val:.3f}'
- else:
- label = str(int(val))
- ax2_twin.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.2,
- label, ha='center', va='bottom', fontweight='bold', fontsize=11)
-
- ax2.set_ylabel('Number of Tests', fontsize=12, fontweight='bold', color='#2E86AB')
- ax2_twin.set_ylabel('Count / Cost per 1K Tests', fontsize=12, fontweight='bold')
- ax2.set_title('Comprehensive Evaluation Scope\nAccessible Methodology', fontsize=14, fontweight='bold')
-
- ax2.set_xticks([0, 1, 2, 3])
- ax2.set_xticklabels(['Total Tests'] + other_metrics, rotation=15, ha='right')
- ax2.set_ylim(0, 140000)
- ax2_twin.set_ylim(0, 15)
-
- plt.suptitle('Bias Evaluation Methodology: Cost and Scope\nStandardized, Reproducible Assessment',
- fontsize=16, fontweight='bold', y=0.95)
-
- plt.tight_layout()
- plt.savefig('/Users/mdangelo/projects/pf2/site/static/img/blog/bbq-bias/methodology-accessibility-rigorous.png',
- dpi=300, bbox_inches='tight', facecolor='white')
- plt.close()
- def main():
- print("šØ Creating rigorous, professional charts...")
- print("š§ Removing sensationalistic language")
- print("š Ensuring data accuracy and clarity")
-
- create_primary_findings_chart()
- print("ā
Primary findings chart created")
-
- create_development_timeline_chart()
- print("ā
Development timeline chart created")
-
- create_methodology_accessibility_chart()
- print("ā
Methodology accessibility chart created")
-
- print("\nš All rigorous charts generated successfully!")
- print("šÆ Charts now match the professional tone of the cleaned article")
- if __name__ == "__main__":
- main()
|