Data Analysis Examples
This page demonstrates how to use PyEPISuite for comprehensive data analysis workflows, from basic property exploration to advanced statistical analysis.
Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyepisuite import search_episuite_by_cas, submit_to_episuite
from pyepisuite.dataframe_utils import (
    episuite_to_dataframe,
    ecosar_to_dataframe,
    combine_episuite_ecosar_dataframes,
    create_summary_statistics
)
# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
Dataset Preparation
Let's analyze a diverse set of chemicals spanning pharmaceuticals, pesticides, industrial chemicals, and solvents:
# Define a comprehensive dataset
chemicals = {
    'pharmaceuticals': ['50-78-2', '58-08-2', '50-02-2'],  # Aspirin, Caffeine, Dexamethasone
    'pesticides': ['50-29-3', '72-20-8', '309-00-2'],      # DDT, Endrin, Aldrin
    'industrial': ['100-41-4', '108-88-3', '71-43-2'],     # Ethylbenzene, Toluene, Benzene
    'solvents': ['67-56-1', '64-17-5', '67-64-1'],         # Methanol, Ethanol, Acetone
}
# Flatten the list
all_cas = [cas for category in chemicals.values() for cas in category]
# Get data
print(f"Analyzing {len(all_cas)} chemicals...")
ids = search_episuite_by_cas(all_cas)
epi_results, ecosar_results = submit_to_episuite(ids)
# Convert to DataFrames
epi_df = episuite_to_dataframe(epi_results)
ecosar_df = ecosar_to_dataframe(ecosar_results)
combined_df = combine_episuite_ecosar_dataframes(epi_df, ecosar_df)
# Add chemical categories
category_map = {}
for category, cas_list in chemicals.items():
    for cas in cas_list:
        category_map[cas] = category
epi_df['category'] = epi_df['cas'].map(category_map)
combined_df['category'] = combined_df['cas'].map(category_map)
print(f"Successfully analyzed {len(epi_df)} chemicals")
print(f"Categories: {epi_df['category'].value_counts().to_dict()}")
Exploratory Data Analysis
Property Distributions by Category
# Select key properties for analysis
key_properties = [
    'log_kow_estimated',
    'water_solubility_logkow_estimated',
    'log_bioconcentration_factor',
    'atmospheric_half_life_estimated',
    'vapor_pressure_estimated'
]
# Create subplots for each property
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()
for i, prop in enumerate(key_properties):
    ax = axes[i]
    # Box plot by category
    epi_df.boxplot(column=prop, by='category', ax=ax)
    ax.set_title(f'{prop.replace("_", " ").title()}')
    ax.set_xlabel('Chemical Category')
    ax.set_ylabel('Value')
    # Rotate x-axis labels
    ax.tick_params(axis='x', rotation=45)
# Remove empty subplot
if len(key_properties) < len(axes):
    fig.delaxes(axes[-1])
plt.tight_layout()
plt.suptitle('Property Distributions by Chemical Category', y=1.02)
plt.show()
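Box plots show the spread within each category, but a compact numeric summary is easier to quote in a report. A per-category median table of the same properties is a quick complement:
# Median of each key property within each chemical category
category_medians = epi_df.groupby('category')[key_properties].median()
print(category_medians.round(2))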
Correlation Analysis
# Calculate correlations for numeric properties
numeric_props = epi_df.select_dtypes(include=[np.number]).columns.tolist()
numeric_props = [col for col in numeric_props if 'estimated' in col or col in ['molecular_weight']]
# Create correlation matrix
corr_matrix = epi_df[numeric_props].corr()
# Plot enhanced heatmap
plt.figure(figsize=(14, 10))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    cmap='RdBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8}
)
plt.title('Property Correlation Matrix', fontsize=16, pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
# Find strongest correlations
corr_pairs = []
for i in range(len(corr_matrix.columns)):
    for j in range(i + 1, len(corr_matrix.columns)):
        corr_pairs.append({
            'property1': corr_matrix.columns[i],
            'property2': corr_matrix.columns[j],
            'correlation': corr_matrix.iloc[i, j]
        })
corr_df = pd.DataFrame(corr_pairs)
print("\\nStrongest correlations (|r| > 0.7):")
strong_corr = corr_df[abs(corr_df['correlation']) > 0.7].sort_values('correlation', key=abs, ascending=False)
print(strong_corr.head(10).to_string(index=False))
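For any pair flagged above, a scatter plot makes the relationship and any category effects visible. As one example, the sketch below plots estimated log Kow against estimated water solubility, two columns already used in this workflow; swap in whichever pair the correlation table highlights.
# Scatter plot of one strongly correlated property pair, colored by category
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=epi_df,
    x='log_kow_estimated',
    y='water_solubility_logkow_estimated',
    hue='category',
    s=100
)
plt.title('Log Kow vs. Estimated Water Solubility')
plt.tight_layout()
plt.show()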
Advanced Analysis
Principal Component Analysis
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Prepare data for PCA
pca_props = [
    'log_kow_estimated', 'water_solubility_logkow_estimated',
    'log_bioconcentration_factor', 'atmospheric_half_life_estimated',
    'vapor_pressure_estimated', 'molecular_weight'
]
# Remove rows with missing values
pca_data = epi_df[pca_props + ['category', 'name']].dropna()
# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(pca_data[pca_props])
# Perform PCA
pca = PCA()
pca_result = pca.fit_transform(scaled_features)
# Create DataFrame with PCA results
pca_df = pd.DataFrame(
    pca_result[:, :3],
    columns=['PC1', 'PC2', 'PC3'],
    index=pca_data.index
)
pca_df['category'] = pca_data['category']
pca_df['name'] = pca_data['name']
# Plot explained variance
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.bar(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_)
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('PCA Explained Variance')
plt.subplot(1, 2, 2)
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1),
         np.cumsum(pca.explained_variance_ratio_), 'bo-')
plt.xlabel('Principal Component')
plt.ylabel('Cumulative Explained Variance')
plt.title('Cumulative Explained Variance')
plt.axhline(y=0.8, color='r', linestyle='--', alpha=0.7, label='80%')
plt.legend()
plt.tight_layout()
plt.show()
print(f"First 3 components explain {sum(pca.explained_variance_ratio_[:3]):.1%} of variance")
# Plot PCA results
plt.figure(figsize=(12, 8))
categories = pca_df['category'].unique()
colors = plt.cm.Set1(np.linspace(0, 1, len(categories)))
for category, color in zip(categories, colors):
    mask = pca_df['category'] == category
    plt.scatter(pca_df.loc[mask, 'PC1'], pca_df.loc[mask, 'PC2'],
                c=[color], label=category, alpha=0.7, s=100)
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('PCA of Chemical Properties')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Feature loadings
loadings = pd.DataFrame(
    pca.components_[:3].T,
    columns=['PC1', 'PC2', 'PC3'],
    index=pca_props
)
print("\nFeature loadings (first 3 components):")
print(loadings.round(3))
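To read the loadings at a glance, it can help to list which property dominates each component; this is plain pandas on the `loadings` table built above.
# Property with the largest absolute loading on each of the first three components
dominant_features = loadings.abs().idxmax()
print("\nDominant property per component:")
print(dominant_features)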
Clustering Analysis
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Determine optimal number of clusters
k_range = range(2, 8)
silhouette_scores = []
inertias = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(scaled_features)
    silhouette_avg = silhouette_score(scaled_features, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    inertias.append(kmeans.inertia_)
# Plot elbow curve and silhouette scores
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.plot(k_range, inertias, 'bo-')
ax1.set_xlabel('Number of Clusters (k)')
ax1.set_ylabel('Inertia')
ax1.set_title('Elbow Method')
ax1.grid(True, alpha=0.3)
ax2.plot(k_range, silhouette_scores, 'ro-')
ax2.set_xlabel('Number of Clusters (k)')
ax2.set_ylabel('Silhouette Score')
ax2.set_title('Silhouette Analysis')
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Perform clustering with optimal k
optimal_k = k_range[np.argmax(silhouette_scores)]
print(f"Optimal number of clusters: {optimal_k}")
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_features)
# Add cluster labels to DataFrame
pca_data_clustered = pca_data.copy()
pca_data_clustered['cluster'] = cluster_labels
# Visualize clusters in PCA space
plt.figure(figsize=(12, 8))
scatter = plt.scatter(pca_result[:, 0], pca_result[:, 1],
                      c=cluster_labels, cmap='viridis', alpha=0.7, s=100)
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('Chemical Clusters in PCA Space')
plt.colorbar(scatter, label='Cluster')
# Add cluster centers
centers_pca = pca.transform(kmeans.cluster_centers_)
plt.scatter(centers_pca[:, 0], centers_pca[:, 1],
            c='red', marker='x', s=200, linewidths=3, label='Centroids')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Analyze cluster characteristics
print("\\nCluster Characteristics:")
cluster_summary = pca_data_clustered.groupby('cluster')[pca_props].mean()
print(cluster_summary.round(3))
print("\\nChemicals by cluster:")
for cluster in range(optimal_k):
chemicals_in_cluster = pca_data_clustered[pca_data_clustered['cluster'] == cluster]['name'].tolist()
print(f"Cluster {cluster}: {', '.join(chemicals_in_cluster)}")
Environmental Risk Assessment
Bioaccumulation Potential
# Classify chemicals by bioaccumulation potential
def classify_bioaccumulation(log_bcf):
    if pd.isna(log_bcf):
        return 'Unknown'
    elif log_bcf < 2:
        return 'Low'
    elif log_bcf < 3:
        return 'Moderate'
    else:
        return 'High'
epi_df['bcf_category'] = epi_df['log_bioconcentration_factor'].apply(classify_bioaccumulation)
# Plot bioaccumulation categories
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# Overall distribution
bcf_counts = epi_df['bcf_category'].value_counts()
ax1.pie(bcf_counts.values, labels=bcf_counts.index, autopct='%1.1f%%', startangle=90)
ax1.set_title('Bioaccumulation Potential Distribution')
# By chemical category
bcf_by_category = pd.crosstab(epi_df['category'], epi_df['bcf_category'])
bcf_by_category.plot(kind='bar', stacked=True, ax=ax2)
ax2.set_title('Bioaccumulation Potential by Chemical Category')
ax2.set_xlabel('Chemical Category')
ax2.set_ylabel('Number of Chemicals')
ax2.legend(title='BCF Category')
ax2.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
print("Bioaccumulation classification:")
print(bcf_by_category)
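Beyond the counts, it is usually worth listing exactly which chemicals fall into the 'High' bucket:
# List the individual chemicals classified as having high bioaccumulation potential
high_bcf = epi_df.loc[
    epi_df['bcf_category'] == 'High',
    ['name', 'cas', 'category', 'log_bioconcentration_factor']
]
print("\nChemicals with high bioaccumulation potential:")
print(high_bcf.to_string(index=False))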
Persistence Assessment
# Classify persistence based on atmospheric half-life
def classify_persistence(half_life_hours):
    if pd.isna(half_life_hours):
        return 'Unknown'
    elif half_life_hours < 24:
        return 'Non-persistent'
    elif half_life_hours < 168:  # 1 week
        return 'Moderately persistent'
    else:
        return 'Persistent'
epi_df['persistence_category'] = epi_df['atmospheric_half_life_estimated'].apply(classify_persistence)
# Create risk matrix
risk_matrix = pd.crosstab(epi_df['persistence_category'], epi_df['bcf_category'])
# Plot risk matrix heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(risk_matrix, annot=True, fmt='d', cmap='Reds',
            cbar_kws={'label': 'Number of Chemicals'})
plt.title('Environmental Risk Matrix\n(Persistence vs Bioaccumulation)')
plt.xlabel('Bioaccumulation Potential')
plt.ylabel('Persistence')
plt.tight_layout()
plt.show()
print("Environmental Risk Matrix:")
print(risk_matrix)
# Identify high-risk chemicals
high_risk = epi_df[
    (epi_df['persistence_category'] == 'Persistent') &
    (epi_df['bcf_category'] == 'High')
]
if len(high_risk) > 0:
    print(f"\nHigh-risk chemicals (Persistent + High BCF): {len(high_risk)}")
    print(high_risk[['name', 'cas', 'log_bioconcentration_factor',
                     'atmospheric_half_life_estimated']].to_string(index=False))
else:
    print("\nNo chemicals classified as high-risk (Persistent + High BCF)")
Ecotoxicity Analysis
# Analyze EcoSAR predictions
print("EcoSAR Analysis Summary:")
print(f"Total predictions: {len(ecosar_df)}")
print(f"Unique chemicals: {ecosar_df['cas'].nunique()}")
print(f"Organisms: {', '.join(ecosar_df['organism'].unique())}")
print(f"Endpoints: {', '.join(ecosar_df['endpoint'].unique())}")
# Plot toxicity distributions by organism
organisms = ecosar_df['organism'].unique()
fig, axes = plt.subplots(1, len(organisms), figsize=(18, 6))
axes = np.atleast_1d(axes)  # keep axes indexable even if only one organism is present
for i, organism in enumerate(organisms):
    org_data = ecosar_df[ecosar_df['organism'] == organism]
    # Log-transform concentrations for better visualization
    log_conc = np.log10(org_data['concentration'])
    axes[i].hist(log_conc, bins=15, alpha=0.7, edgecolor='black')
    axes[i].set_title(f'{organism.title()} Toxicity')
    axes[i].set_xlabel('Log10 Concentration (mg/L)')
    axes[i].set_ylabel('Frequency')
    axes[i].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Most toxic chemicals (lowest concentrations)
print("\\nMost toxic chemicals by organism:")
for organism in organisms:
org_data = ecosar_df[ecosar_df['organism'] == organism]
most_toxic = org_data.nsmallest(3, 'concentration')[['cas', 'concentration', 'endpoint']]
print(f"\\n{organism.title()}:")
for _, row in most_toxic.iterrows():
chemical_name = epi_df[epi_df['cas'] == row['cas']]['name'].iloc[0] if len(epi_df[epi_df['cas'] == row['cas']]) > 0 else 'Unknown'
print(f" {chemical_name} (CAS: {row['cas']}): {row['concentration']:.3f} mg/L ({row['endpoint']})")
Statistical Analysis
ANOVA: Property Differences Between Categories
from scipy import stats
# Perform ANOVA for each property
properties_for_anova = [
    'log_kow_estimated', 'water_solubility_logkow_estimated',
    'log_bioconcentration_factor', 'atmospheric_half_life_estimated'
]
anova_results = []
for prop in properties_for_anova:
    # Prepare data for ANOVA (remove NaN values)
    clean_data = epi_df[[prop, 'category']].dropna()
    # Group by category
    groups = [group[prop].values for name, group in clean_data.groupby('category')]
    # Perform ANOVA
    f_stat, p_value = stats.f_oneway(*groups)
    anova_results.append({
        'property': prop,
        'f_statistic': f_stat,
        'p_value': p_value,
        'significant': p_value < 0.05
    })
anova_df = pd.DataFrame(anova_results)
print("ANOVA Results (Property differences between chemical categories):")
print(anova_df.round(4))
# Post-hoc analysis for significant results
significant_props = anova_df[anova_df['significant']]['property'].tolist()
if significant_props:
    print(f"\nSignificant differences found in: {', '.join(significant_props)}")
    # Perform Tukey's HSD test for significant properties
    from scipy.stats import tukey_hsd
    for prop in significant_props[:2]:  # Limit to first 2 for brevity
        clean_data = epi_df[[prop, 'category']].dropna()
        groups = [group[prop].values for name, group in clean_data.groupby('category')]
        categories = list(clean_data.groupby('category').groups.keys())
        try:
            tukey_result = tukey_hsd(*groups)
            print(f"\nTukey HSD test for {prop}:")
            # Create pairwise comparison table
            n_groups = len(categories)
            for i in range(n_groups):
                for j in range(i + 1, n_groups):
                    pvalue = tukey_result.pvalue[i, j]
                    if pvalue < 0.05:
                        print(f"  {categories[i]} vs {categories[j]}: p = {pvalue:.4f} (significant)")
        except Exception:
            print(f"Could not perform Tukey test for {prop}")
Report Generation
def generate_analysis_report(epi_df, ecosar_df):
    """Generate a comprehensive analysis report."""
    report = {
        'dataset_summary': {
            'total_chemicals': len(epi_df),
            'chemical_categories': epi_df['category'].value_counts().to_dict(),
            'properties_analyzed': len([col for col in epi_df.columns if 'estimated' in col])
        },
        'property_statistics': {},
        'risk_assessment': {},
        'ecotoxicity_summary': {}
    }
    # Property statistics
    numeric_props = epi_df.select_dtypes(include=[np.number]).columns
    for prop in numeric_props:
        if 'estimated' in prop:
            report['property_statistics'][prop] = {
                'mean': float(epi_df[prop].mean()) if not pd.isna(epi_df[prop].mean()) else None,
                'std': float(epi_df[prop].std()) if not pd.isna(epi_df[prop].std()) else None,
                'min': float(epi_df[prop].min()) if not pd.isna(epi_df[prop].min()) else None,
                'max': float(epi_df[prop].max()) if not pd.isna(epi_df[prop].max()) else None,
                'missing_pct': float((epi_df[prop].isna().sum() / len(epi_df)) * 100)
            }
    # Risk assessment summary
    if 'bcf_category' in epi_df.columns:
        report['risk_assessment']['bioaccumulation'] = epi_df['bcf_category'].value_counts().to_dict()
    if 'persistence_category' in epi_df.columns:
        report['risk_assessment']['persistence'] = epi_df['persistence_category'].value_counts().to_dict()
    # Ecotoxicity summary
    if len(ecosar_df) > 0:
        report['ecotoxicity_summary'] = {
            'total_predictions': len(ecosar_df),
            'organisms': ecosar_df['organism'].value_counts().to_dict(),
            'endpoints': ecosar_df['endpoint'].value_counts().to_dict(),
            'concentration_stats': {
                'mean': float(ecosar_df['concentration'].mean()),
                'median': float(ecosar_df['concentration'].median()),
                'min': float(ecosar_df['concentration'].min()),
                'max': float(ecosar_df['concentration'].max())
            }
        }
    return report
# Generate and display report
analysis_report = generate_analysis_report(epi_df, ecosar_df)
print("=== ANALYSIS REPORT ===\\n")
print("Dataset Summary:")
for key, value in analysis_report['dataset_summary'].items():
    print(f"  {key.replace('_', ' ').title()}: {value}")
print("\nProperty Statistics (top 5 by completeness):")
prop_stats = analysis_report['property_statistics']
complete_props = sorted(prop_stats.items(), key=lambda x: x[1]['missing_pct'])[:5]
for prop, prop_stats in complete_props:  # avoid shadowing the scipy 'stats' module imported above
    print(f"  {prop}:")
    print(f"    Mean ± SD: {prop_stats['mean']:.3f} ± {prop_stats['std']:.3f}")
    print(f"    Range: {prop_stats['min']:.3f} to {prop_stats['max']:.3f}")
    print(f"    Missing: {prop_stats['missing_pct']:.1f}%")
if 'bioaccumulation' in analysis_report['risk_assessment']:
    print("\nBioaccumulation Risk Distribution:")
    for category, count in analysis_report['risk_assessment']['bioaccumulation'].items():
        print(f"  {category}: {count} chemicals")
if analysis_report['ecotoxicity_summary']:
    print("\nEcotoxicity Summary:")
    eco_stats = analysis_report['ecotoxicity_summary']['concentration_stats']
    print(f"  Concentration range: {eco_stats['min']:.3f} - {eco_stats['max']:.3f} mg/L")
    print(f"  Median toxicity: {eco_stats['median']:.3f} mg/L")
Exporting Results
# Prepare comprehensive export
export_data = {
    'Summary': create_summary_statistics(epi_df),
    'Chemical_Properties': epi_df,
    'Ecotoxicity_Predictions': ecosar_df,
    'Risk_Classification': epi_df[['name', 'cas', 'category', 'bcf_category', 'persistence_category']],
    'Correlation_Matrix': corr_matrix,
    'PCA_Results': pca_df,
    'Cluster_Analysis': pca_data_clustered
}
# Export to Excel
from pyepisuite.dataframe_utils import export_to_excel
export_to_excel(export_data, 'comprehensive_chemical_analysis.xlsx')
print("\\nAnalysis complete! Results exported to 'comprehensive_chemical_analysis.xlsx'")
print("\\nKey findings:")
print(f"- Analyzed {len(epi_df)} chemicals across {len(epi_df['category'].unique())} categories")
print(f"- Generated {len(ecosar_df)} ecotoxicity predictions")
print(f"- Identified {len(epi_df[epi_df['bcf_category'] == 'High'])} chemicals with high bioaccumulation potential")
print(f"- Found {len(strong_corr)} strong property correlations (|r| > 0.7)")
This example demonstrates how PyEPISuite can support end-to-end environmental data analysis, combining chemical property predictions, statistical analysis, and risk assessment in a single workflow.