# =============================================================================
# Data Discovery Configuration File
# =============================================================================
# Complete configuration matching README.md documentation
# Update with your specific database and environment settings
#
# This file controls all aspects of the Data Discovery application including:
# - Database connectivity and performance
# - PII classification rules and thresholds
# - Trust scoring methodology (Pentaho Data Catalog aligned)
# - Dashboard behavior and caching
# - Business domain mapping and analytics
# =============================================================================
# =============================================================================
# DATABASE CONFIGURATION
# =============================================================================
# Controls how the application connects to and interacts with your database
database:
  # Basic connection settings - Update these for your database
  host: "localhost"                    # Database server hostname or IP
  port: 1433                           # Database port (1433 for SQL Server)
  database: "AdventureWorks2022"       # Database name to analyze
  username: "sa"                       # Database username
  password: "${DB_PASSWORD}"           # Use environment variable for security
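  # The referenced environment variable must be set before the application
  # starts, e.g. (shell syntax; the value is a placeholder):
  #   export DB_PASSWORD='changeme'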
  driver: "sqlserver"                  # Database type: sqlserver, postgresql, mysql, oracle

  # Connection management and timeouts
  connection_timeout: 30               # Seconds to wait for initial connection
  query_timeout: 300                   # Seconds to wait for query completion (5 minutes)
  max_connections: 10                  # Maximum concurrent database connections

  # Performance optimization for large databases
  batch_size: 1000                     # Number of tables to process in each batch
  parallel_workers: 4                  # Number of parallel processing threads

  # JDBC-specific connection parameters
  jdbc_url: null                       # Custom JDBC URL (overrides host/port if set)
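  # Illustrative example only; the exact URL syntax depends on your JDBC driver.
  # For Microsoft's SQL Server driver it typically looks like:
  #   jdbc_url: "jdbc:sqlserver://localhost:1433;databaseName=AdventureWorks2022"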
  additional_params:
    TrustServerCertificate: "true"     # Accept self-signed certificates
    IntegratedSecurity: "false"        # Use SQL Server authentication
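  # For SQL Server these parameters are conventionally appended to the JDBC URL
  # as ;key=value pairs (e.g. ;TrustServerCertificate=true); other drivers differ.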
# =============================================================================
# DISCOVERY CONFIGURATION
# =============================================================================
# Controls what data is discovered and how the analysis is performed
discovery:
  # Database schemas to include in discovery
  # Leave empty ([]) to discover all schemas, or list only the ones to include
  # (see the example after this list)
  schemas:
    - "Sales"                          # Sales and customer transactions
    - "Production"                     # Manufacturing and inventory data
    - "HumanResources"                 # Employee and HR information
    - "Purchasing"                     # Vendor and procurement data
    - "Person"                         # Customer and contact information
  # Data sampling for analysis (affects performance vs. accuracy)
  sample_size: 100                     # Number of rows to sample per table for pattern analysis

  # Usage pattern analysis timeframe
  usage_analysis_days: 30              # Days of historical usage data to analyze
# =============================================================================
# CLASSIFICATION CONFIGURATION
# =============================================================================
# Controls PII detection and data classification using the Pentaho methodology
classification:
  # Classification accuracy vs. performance trade-off
  confidence_threshold: 0.7            # Minimum confidence score (0.0-1.0) to classify as PII

  # Rule engine settings
  enable_custom_rules: true            # Enable user-defined classification patterns
  enable_adventureworks_rules: true    # Enable AdventureWorks-specific classification rules
  # Pentaho standard PII categories to detect
  # These align with industry-standard privacy frameworks (GDPR, CCPA, etc.)
  pii_categories:
    - "GOVERNMENT_ID"                  # SSN, passport, national ID, tax ID
    - "PERSONAL_NAME"                  # First, last, full names, display names
    - "CONTACT_INFO"                   # Email addresses, phone numbers
    - "ADDRESS_INFO"                   # Street, city, ZIP, postal codes
    - "DEMOGRAPHIC"                    # Birth date, gender, age, ethnicity
    - "FINANCIAL"                      # Salary, credit cards, bank accounts
    - "HEALTH"                         # Medical records, diagnoses, treatments
    - "AUTHENTICATION"                 # Passwords, secrets, tokens
    - "BIOMETRIC"                      # Fingerprints, photos, biometric data
  # Performance optimization
  cache_results: true                  # Cache classification results to improve performance

  # Custom classification patterns
  custom_patterns:
    - name: "Custom SSN Pattern"
      category: "GOVERNMENT_ID"
      sensitivity: "HIGH"
      column_patterns: ["ssn", "social_security"]
      data_patterns: ["^\\d{3}-?\\d{2}-?\\d{4}$"]
    - name: "Email Pattern"
      category: "CONTACT_INFO"
      sensitivity: "MEDIUM"
      column_patterns: ["email", "mail"]
      data_patterns: ["^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$"]
# =============================================================================
# TRUST SCORING CONFIGURATION (Pentaho Methodology)
# =============================================================================
# Implements the Pentaho Data Catalog trust scoring methodology
# Reference: https://academy.pentaho.com/pentaho-data-catalog/use-case/adventure-works/
trust_scoring:
  # Pentaho methodology component weights (must sum to 1.0)
  # These weights determine how different factors contribute to the overall trust score
  business_value_weight: 0.30          # Business importance and usage (30%)
  data_type_weight: 0.25               # Data type appropriateness and consistency (25%)
  constraints_weight: 0.25             # Structural integrity and constraints (25%)
  naming_weight: 0.20                  # Naming conventions and descriptiveness (20%)
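  # Worked example with illustrative component scores (each on a 0-100 scale):
  #   0.30*90 + 0.25*80 + 0.25*70 + 0.20*60 = 27.0 + 20.0 + 17.5 + 12.0 = 76.5
  #   -> 76.5 >= 75, so the asset lands in the "Excellent" band defined below.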
  # Trust score classification thresholds (0-100 scale)
  excellent_threshold: 75              # Excellent governance (75-100) - Green
  good_threshold: 65                   # Good governance (65-74) - Light Green
  fair_threshold: 50                   # Fair governance (50-64) - Yellow
  # Below 50 = Poor governance - Red

  # Risk level assessment thresholds
  critical_threshold: 40               # Critical risk (0-39) - Immediate attention required
  high_threshold: 60                   # High risk (40-59) - Priority remediation
  medium_threshold: 75                 # Medium risk (60-74) - Monitor and improve
  # 75 and above = Low risk - Well governed
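  # Example: a trust score of 55 falls in the 40-59 band -> High risk,
  # i.e. a priority-remediation candidate.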
# =============================================================================
# DASHBOARD CONFIGURATION
# =============================================================================
# Controls the web dashboard behavior, performance, and user experience
dashboard:
  # Caching settings for improved performance
  enable_caching: true                 # Enable server-side caching of dashboard data
  cache_duration: 300                  # Cache lifetime in seconds (5 minutes)

  # User interface display limits
  max_results_per_page: 100            # Maximum items per page in filtered views
  max_records_display: 10000           # Maximum total records to display in UI

  # Performance optimization
  enable_compression: true             # Compress API responses to reduce bandwidth
# =============================================================================
# COMPLIANCE CONFIGURATION
# =============================================================================
# Defines regulatory frameworks and compliance assessment parameters
compliance:
  # Regulatory frameworks to assess against
  # The system will evaluate data handling practices against these standards
  regulations:
    - "GDPR"                           # General Data Protection Regulation (EU)
    - "CCPA"                           # California Consumer Privacy Act (US)
    - "HIPAA"                          # Health Insurance Portability and Accountability Act (US)
    - "SOX"                            # Sarbanes-Oxley Act (US)
    - "PCI_DSS"                        # Payment Card Industry Data Security Standard

  # Compliance monitoring schedule
  assessment_frequency: 90             # Days between automated compliance assessments
# =============================================================================
# OUTPUT CONFIGURATION
# =============================================================================
# Controls report generation, file formats, and export behavior
output:
  # File system settings
  directory: "./reports"               # Directory for generated reports and exports

  # Supported export formats
  formats:
    - "csv"                            # Comma-separated values (data analysis)
    - "json"                           # JavaScript Object Notation (API integration)
    - "xlsx"                           # Excel workbook (business users)
    - "pdf"                            # Portable Document Format (executive reports)

  # Data inclusion settings
  include_samples: true                # Include sample data values in classification reports

  # File naming convention
  timestamp_format: "%Y%m%d_%H%M%S"    # Format: YYYYMMDD_HHMMSS
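  # Example: a report generated on 2024-01-15 at 14:30:52 gets the suffix
  # 20240115_143052; the rest of the file name is determined by the application.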
# =============================================================================
# LOGGING CONFIGURATION
# =============================================================================
# Controls application logging, debugging, and troubleshooting capabilities
logging:
  # Log verbosity level
  #   DEBUG:   Detailed diagnostic information
  #   INFO:    General operational messages (recommended)
  #   WARNING: Important warnings and issues
  #   ERROR:   Only error conditions
  level: "INFO"

  # Log file location
  file: "./logs/data_discovery.log"    # Main application log file

  # Log rotation to prevent disk space issues
  max_file_size: "10MB"                # Maximum size before rotation
  backup_count: 5                      # Number of backup log files to keep
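  # With these settings, at most six files are kept on disk, assuming the
  # common numbered-suffix rotation scheme: data_discovery.log plus
  # data_discovery.log.1 through data_discovery.log.5, each up to 10MB.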
# =============================================================================
# BUSINESS DOMAIN MAPPING
# =============================================================================
# Maps database schemas to business domains and defines their relative importance
business_domains:
  # Schema to business domain classification
  # This mapping helps contextualize data within business functions
  schema_mappings:
    Sales: "SALES"                     # Sales transactions and customer relationships
    Production: "MANUFACTURING"        # Manufacturing processes and inventory
    HumanResources: "HR"               # Employee data and human resources
    Purchasing: "PROCUREMENT"          # Vendor relationships and procurement
    Person: "CUSTOMER_DATA"            # Customer profiles and contact information

  # Business criticality weights (1-10 scale)
  # Higher weights indicate more business-critical data requiring stronger governance
  importance_weights:
    SALES: 10.0                        # Highest priority - revenue generating
    CUSTOMER_DATA: 9.0                 # High priority - customer relationships
    HR: 9.0                            # High priority - employee information
    MANUFACTURING: 8.0                 # Important - operational data
    PROCUREMENT: 7.0                   # Important - vendor relationships
    DEFAULT: 6.0                       # Standard priority for unmapped schemas
# =============================================================================
# ADVANCED ANALYTICS CONFIGURATION
# =============================================================================
# Controls advanced data quality analysis and usage pattern tracking
analytics:
  # Feature enablement
  enable_advanced_analytics: true      # Enable comprehensive data quality analysis

  # Data quality assessment thresholds (0.0-1.0 scale)
  quality_threshold: 0.8               # Minimum quality score for "good" data
  completeness_threshold: 0.9          # Minimum completeness for "complete" data
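  # Example: if completeness is measured as the fraction of non-null values (an
  # assumption about the metric), a column that is 95% populated (0.95) meets
  # the 0.9 threshold, while one that is 85% populated (0.85) falls short.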
  # Usage pattern tracking and analysis
  track_usage_patterns: true           # Monitor and analyze data access patterns
  usage_retention_days: 365            # Days of usage history to retain for analysis