import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from datetime import datetime, timedelta
from prophet import Prophet
import warnings
warnings.filterwarnings('ignore')
class SurveillanceDashboard:
    """
    Real-time surveillance dashboard with multiple data sources and anomaly detection.
    """

    def __init__(self):
        self.data = None
        self.alerts = []
        self.prophet_model = None
    def load_data(self, csv_path=None):
        """Load surveillance data from CSV, or generate synthetic data."""
        if csv_path:
            self.data = pd.read_csv(csv_path, parse_dates=['date'])
        else:
            # Generate synthetic data
            dates = pd.date_range('2023-01-01', '2024-12-31', freq='D')
            n_days = len(dates)

            # Seasonal baseline with a winter peak around mid-January
            day_of_year = dates.dayofyear
            seasonal = 50 + 30 * np.cos(2 * np.pi * (day_of_year - 15) / 365)

            baseline = seasonal + np.random.normal(0, 5, n_days)
            baseline = np.maximum(baseline, 0)

            # Inject an outbreak from Oct 1 to Nov 15, 2024
            outbreak_mask = (dates >= '2024-10-01') & (dates <= '2024-11-15')
            baseline[outbreak_mask] += 50 + np.random.normal(0, 10, outbreak_mask.sum())

            self.data = pd.DataFrame({
                'date': dates,
                'syndromic_counts': baseline,
                'test_positivity': np.random.uniform(5, 15, n_days),
                # Clip at zero so noise cannot produce negative hospitalization counts
                'hospitalizations': np.maximum(baseline * 0.05 + np.random.normal(0, 2, n_days), 0)
            })
    def detect_anomalies(self, column='syndromic_counts', train_end='2024-10-01'):
        """Detect anomalies by comparing observations against a Prophet forecast."""
        # Prepare data in Prophet's expected format (columns 'ds' and 'y')
        df_prophet = self.data[['date', column]].copy()
        df_prophet.columns = ['ds', 'y']

        # Train only on data before train_end so the outbreak period
        # does not contaminate the baseline model
        train_df = df_prophet[df_prophet['ds'] < train_end]

        # Fit Prophet with a 95% prediction interval
        self.prophet_model = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=True,
            changepoint_prior_scale=0.05,
            interval_width=0.95
        )
        self.prophet_model.fit(train_df)

        # Predict over the full date range, including the holdout period
        forecast = self.prophet_model.predict(df_prophet[['ds']])

        # Attach expected values and prediction bounds to the data
        self.data['expected'] = forecast['yhat'].values
        self.data['lower_bound'] = forecast['yhat_lower'].values
        self.data['upper_bound'] = forecast['yhat_upper'].values

        # Flag observations that fall outside the prediction interval
        self.data['anomaly'] = (
            (self.data[column] < self.data['lower_bound']) |
            (self.data[column] > self.data['upper_bound'])
        )

        # Anomaly score: absolute deviation from expected, scaled by interval width
        self.data['anomaly_score'] = np.abs(
            self.data[column] - self.data['expected']
        ) / (self.data['upper_bound'] - self.data['lower_bound'])

        # Generate alerts for sustained anomalies
        self._generate_alerts(column)
    def _generate_alerts(self, column, min_duration=3):
        """Generate alerts for anomalies sustained for at least min_duration days."""
        from scipy.ndimage import label

        # Label contiguous runs of anomalous days
        anomaly_regions, n_regions = label(self.data['anomaly'].values)

        for region_id in range(1, n_regions + 1):
            region_mask = anomaly_regions == region_id
            region_length = region_mask.sum()

            if region_length >= min_duration:
                region_data = self.data[region_mask]
                self.alerts.append({
                    'start_date': region_data['date'].min(),
                    'end_date': region_data['date'].max(),
                    'duration_days': int(region_length),
                    'mean_anomaly_score': region_data['anomaly_score'].mean(),
                    'max_value': region_data[column].max(),
                    'priority': 'HIGH' if region_data['anomaly_score'].mean() > 2 else 'MEDIUM'
                })
    def create_dashboard(self):
        """Create an interactive three-panel dashboard with Plotly."""
        fig = make_subplots(
            rows=3, cols=1,
            subplot_titles=(
                'Syndromic Surveillance with Anomaly Detection',
                'Test Positivity Rate',
                'Anomaly Scores Over Time'
            ),
            vertical_spacing=0.12
        )

        # Top panel: observed counts with the Prophet forecast
        fig.add_trace(
            go.Scatter(
                x=self.data['date'],
                y=self.data['syndromic_counts'],
                mode='markers',
                name='Actual Counts',
                marker=dict(size=4, color='steelblue')
            ),
            row=1, col=1
        )
        fig.add_trace(
            go.Scatter(
                x=self.data['date'],
                y=self.data['expected'],
                mode='lines',
                name='Expected (Prophet)',
                line=dict(color='blue', width=2)
            ),
            row=1, col=1
        )

        # Prediction interval: invisible upper bound first, then the lower
        # bound filled up to it
        fig.add_trace(
            go.Scatter(
                x=self.data['date'],
                y=self.data['upper_bound'],
                mode='lines',
                line=dict(width=0),
                showlegend=False
            ),
            row=1, col=1
        )
        fig.add_trace(
            go.Scatter(
                x=self.data['date'],
                y=self.data['lower_bound'],
                mode='lines',
                fill='tonexty',
                fillcolor='rgba(0,100,200,0.2)',
                line=dict(width=0),
                name='95% Prediction Interval'
            ),
            row=1, col=1
        )

        # Highlight flagged anomalies
        anomalies = self.data[self.data['anomaly']]
        fig.add_trace(
            go.Scatter(
                x=anomalies['date'],
                y=anomalies['syndromic_counts'],
                mode='markers',
                name='Anomalies',
                marker=dict(size=10, color='red', symbol='x')
            ),
            row=1, col=1
        )

        # Middle panel: test positivity
        fig.add_trace(
            go.Scatter(
                x=self.data['date'],
                y=self.data['test_positivity'],
                mode='lines',
                name='Test Positivity %',
                line=dict(color='orange', width=2)
            ),
            row=2, col=1
        )

        # Bottom panel: anomaly scores with alert threshold
        fig.add_trace(
            go.Scatter(
                x=self.data['date'],
                y=self.data['anomaly_score'],
                mode='lines',
                name='Anomaly Score',
                line=dict(color='purple', width=2)
            ),
            row=3, col=1
        )
        fig.add_hline(y=1.0, line_dash="dash", line_color="red",
                      annotation_text="Alert Threshold",
                      row=3, col=1)

        # Axis labels and overall layout
        fig.update_xaxes(title_text="Date", row=3, col=1)
        fig.update_yaxes(title_text="Daily Counts", row=1, col=1)
        fig.update_yaxes(title_text="Percentage", row=2, col=1)
        fig.update_yaxes(title_text="Anomaly Score", row=3, col=1)
        fig.update_layout(
            height=900,
            title_text="Public Health Surveillance Dashboard",
            showlegend=True,
            hovermode='x unified'
        )
        return fig
    def generate_alert_report(self):
        """Generate a plain-text alert report."""
        if len(self.alerts) == 0:
            return "✓ No alerts - surveillance within normal parameters"

        report = f"🚨 {len(self.alerts)} ALERT(S) DETECTED\n"
        report += "=" * 60 + "\n\n"
        for i, alert in enumerate(self.alerts, 1):
            report += f"Alert #{i}:\n"
            report += f"  Period: {alert['start_date'].date()} to {alert['end_date'].date()}\n"
            report += f"  Duration: {alert['duration_days']} days\n"
            report += f"  Priority: {alert['priority']}\n"
            report += f"  Peak Value: {alert['max_value']:.1f}\n"
            report += f"  Mean Anomaly Score: {alert['mean_anomaly_score']:.2f}\n"
            report += "\n"
        return report
# Run the dashboard
dashboard = SurveillanceDashboard()

print("Loading surveillance data...")
dashboard.load_data()

print("Running anomaly detection...")
dashboard.detect_anomalies()

print("\n" + dashboard.generate_alert_report())

print("Creating interactive dashboard...")
fig = dashboard.create_dashboard()
fig.write_html('surveillance_dashboard.html')
print("✓ Dashboard saved to: surveillance_dashboard.html")
print("  Open this file in your web browser to view the interactive dashboard")

# Also save as a static image (requires the kaleido package)
try:
    fig.write_image('surveillance_dashboard.png', width=1400, height=900)
    print("✓ Static image saved to: surveillance_dashboard.png")
except Exception as e:
    print(f"Could not save static image (is kaleido installed?): {e}")
6.4 Social Media Surveillance: Lessons from Google Flu Trends
Social media promised revolutionary disease surveillance. The reality has been… complicated.
6.4.1 The Google Flu Trends Story
2008: The Promise
Google researchers published a landmark paper in Nature (Ginsberg et al., 2009) showing that search query patterns could track influenza activity in near real-time.
The method:

- Identify 45 search terms correlated with CDC ILINet data
- Aggregate searches by region
- Fit a linear model to predict current ILI levels
The results:

- 97% correlation with CDC data
- 1-2 weeks ahead of traditional surveillance
- Updated daily (vs. weekly CDC reports)
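To make the mechanics concrete, here is a minimal sketch of a GFT-style model fit to synthetic data. Ginsberg et al. regressed the log-odds of the ILI physician-visit rate on the log-odds of the ILI-related query fraction; the series, sizes, and noise levels below are invented for illustration, not Google's actual data or pipeline.

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)

# Synthetic weekly data: a seasonal ILI visit rate and a noisy query-fraction proxy
weeks = 156
ili_rate = 0.02 + 0.015 * np.cos(2 * np.pi * np.arange(weeks) / 52)
query_frac = ili_rate * rng.lognormal(0, 0.1, weeks)

def logit(p):
    return np.log(p / (1 - p))

# GFT-style model: logit(ILI rate) ~ logit(query fraction)
X = logit(query_frac).reshape(-1, 1)
y = logit(ili_rate)
model = LinearRegression().fit(X[:104], y[:104])  # train on the first two years

# "Nowcast" the third year from search data alone
pred = 1 / (1 + np.exp(-model.predict(X[104:])))
print(f"Mean absolute error: {np.abs(pred - ili_rate[104:]).mean():.4f}")

The fragility is already visible here: the model is only as good as the stability of the relationship between query_frac and ili_rate, and that relationship is exactly what broke in practice.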
The excitement: “Big data” + Machine learning = Real-time disease tracking!
Media proclaimed: “The end of traditional surveillance!”
2013: The Fall
During the 2012-2013 flu season, Google Flu Trends (GFT) massively overestimated influenza prevalence, predicting more than double the CDC-reported levels at the season's peak.
The post-mortem analysis (Lazer et al., 2014, Science) identified multiple failures:
1. Algorithm Dynamics (Overfitting)

- GFT screened roughly 50 million search terms and selected the 45 best correlates
- With so many candidate predictors, spurious correlations were inevitable
- Example: searches for "high school basketball" correlated with flu season (both peak in winter), so the algorithm included it
- The small simulation below shows why screening so many predictors guarantees strong but meaningless correlates
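The underlying multiple-comparisons problem is easy to demonstrate: screen enough structureless series against any target and some will correlate strongly by pure chance. A minimal simulation (all sizes illustrative):

import numpy as np

rng = np.random.default_rng(42)

# A "flu" target series: 104 weeks of observations (here, pure noise)
target = rng.normal(size=104)

# Screen 100,000 structureless candidate predictors against the target
candidates = rng.normal(size=(100_000, 104))

# Vectorized Pearson correlation of each candidate with the target
c = candidates - candidates.mean(axis=1, keepdims=True)
t = target - target.mean()
corr = (c @ t) / (np.linalg.norm(c, axis=1) * np.linalg.norm(t))

# The best of 100,000 random series looks like a genuine signal
print(f"Max |correlation|: {np.abs(corr).max():.2f}")  # typically ~0.45, from noise alone

With 50 million candidates and seasonally structured data, far higher spurious correlations arise.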
2. Search Behavior Changes

- Media coverage of flu led people to search more, inflating estimates
- Google's search algorithm updates changed which terms appeared
- Auto-complete suggestions biased searches
3. No Mechanism, Only Correlation

- GFT had no epidemiological model; it was purely data-driven
- When patterns changed (e.g., the 2009 H1N1 pandemic), the algorithm failed
- As Lazer et al. wrote, this was "big data hubris": the assumption that big data alone, without theory, is sufficient
4. Closed System, No Transparency

- Google did not reveal which search terms were used
- No independent validation was possible
- When the system failed, no one could diagnose why
Google Flu Trends teaches critical principles for public health AI: augment traditional surveillance rather than replace it, ground models in epidemiological theory, recalibrate continuously as behavior and platforms change, and insist on transparency and independent validation.
For detailed analysis, see Lazer et al., 2014 and Butler, 2013.
6.4.2 Modern Search-Based Surveillance: ARGO
Learning from GFT’s failure, researchers developed ARGO (AutoRegression with Google search data).
Key improvements:

- Combines Google Trends data with CDC ILINet (augmenting rather than replacing it)
- Uses autoregressive time series methods with epidemiological constraints
- Regularly recalibrates as search patterns change
- Transparent (published algorithm, open validation)
Performance:

- Roughly 30% improvement over CDC ILINet alone for nowcasting
- Useful for filling reporting gaps (e.g., estimating the current week before CDC data arrive)
- Robust to search algorithm changes, because it adapts
Code example: Simple nowcasting with search trends
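Below is a minimal ARGO-style sketch on synthetic data: an L1-regularized regression over lagged ILI values (the autoregressive part) plus contemporaneous search-term series. The ILI series, the ten synthetic query signals, and the hyperparameters are placeholders, not the published ARGO configuration.

import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.default_rng(1)

# Synthetic weekly ILI series with seasonality, plus noisy search proxies
weeks = 260
t = np.arange(weeks)
ili = 2 + 1.5 * np.cos(2 * np.pi * t / 52) + rng.normal(0, 0.2, weeks)
search = ili[None, :] * rng.lognormal(0, 0.15, (10, weeks))  # 10 query series

# ARGO-style design matrix: lagged ILI (autoregression) + current search terms
n_lags = 3
rows = []
for i in range(n_lags, weeks):
    rows.append(np.concatenate([ili[i - n_lags:i], search[:, i]]))
X = np.array(rows)
y = ili[n_lags:]

# L1 regularization selects a sparse subset of predictors, guarding against
# the spurious-correlate problem that sank GFT
split = 200
model = Lasso(alpha=0.01).fit(X[:split], y[:split])
nowcast = model.predict(X[split:])
print(f"Nowcast MAE: {np.abs(nowcast - y[split:]).mean():.3f}")

In a real deployment, the model would be refit every week as new ILINet data arrive, which is what keeps it robust to drifting search behavior.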
6.4.3 Twitter/X for Disease Surveillance
Social media offers real-time, high-volume data about health concerns. But it’s noisy, biased, and privacy-sensitive.
Approaches:
1. Keyword-based tracking

- Count mentions of "flu", "fever", "cough" (see the sketch after this list)
- Pros: simple, fast
- Cons: many false positives ("I'm sick of this traffic!")

2. Sentiment analysis

- Classify tweets as genuine health concerns vs. casual mentions
- Paul et al., 2014 showed reasonable correlation with CDC ILINet

3. Bot detection and filtering

- Many "health" tweets come from bots or automated accounts
- These must be filtered out to isolate genuine user posts
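A toy version of keyword-based tracking (approach 1) shows both its simplicity and its false-positive problem; the keyword list, the figurative-use filter, and the example posts are all invented for illustration:

import re

# Toy posts: some genuine symptom reports, some figurative uses
posts = [
    "Woke up with a fever and a terrible cough, staying home today",
    "I'm sick of this traffic!",
    "Half my office is out with the flu this week",
    "This new album is sick",
]

SYMPTOM_TERMS = re.compile(r"\b(flu|fever|cough|sick)\b", re.IGNORECASE)
# Naive filter for common figurative uses of "sick"
FIGURATIVE = re.compile(r"sick of|album is sick", re.IGNORECASE)

def looks_like_symptom_report(text):
    return bool(SYMPTOM_TERMS.search(text)) and not FIGURATIVE.search(text)

hits = [p for p in posts if looks_like_symptom_report(p)]
print(f"{len(hits)} of {len(posts)} posts flagged as possible symptom reports")

Even this tiny example needs a hand-built filter to drop "sick of"; at scale, that hand-tuning is what the classification approaches above replace.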
Challenges:
❌ Selection bias: Twitter users ≠ general population (younger, urban, higher income)
❌ Privacy concerns: Even aggregated health data can reveal sensitive information
❌ Platform changes: API access, data policies constantly evolving
❌ Spam and manipulation: Bots, coordinated campaigns distort signal
❌ Language and cultural variation: Health expressions vary widely
Beyond these technical challenges, using social media for health surveillance raises serious ethical concerns around consent, privacy, and equity.
Best practices:

- Aggregate data (never analyze individual accounts); see the sketch below
- Remove identifying information
- Obtain IRB approval for research use
- Be transparent about surveillance activities
- Consider community engagement
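As a sketch of the first two practices, the snippet below aggregates hypothetical post-level records to weekly counts, drops user identifiers, and suppresses small cells; the field names and the threshold of 5 are illustrative assumptions, not a formal privacy guarantee:

import pandas as pd

# Toy post-level records; user_id values are synthetic placeholders
posts = pd.DataFrame({
    'user_id': ['u1', 'u2', 'u3', 'u1', 'u4', 'u5'],
    'date': pd.to_datetime(['2024-10-01', '2024-10-01', '2024-10-01',
                            '2024-10-02', '2024-10-02', '2024-10-08']),
    'flagged_symptom': [True, True, False, True, True, True],
})

# Aggregate to weekly counts; user_id is dropped entirely
weekly = (posts[posts['flagged_symptom']]
          .groupby(pd.Grouper(key='date', freq='W'))
          .size()
          .rename('symptom_posts')
          .reset_index())

# Suppress small cells (fewer than 5 posts) to reduce re-identification risk
weekly['symptom_posts'] = weekly['symptom_posts'].mask(weekly['symptom_posts'] < 5)
print(weekly)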
See Vayena et al., 2015, PLoS Medicine for an ethical framework.