Data Discovery
Last updated
Was this helpful?
# -----------------------------------------------------------------------------
# Install SDKMAN! and SchemaCrawler.
# NOTE(review): SDKMAN is a per-user install; the original "sudo curl ... | bash"
# would install it under root's home. Run the installer as the workshop user.
# -----------------------------------------------------------------------------
curl -s "https://get.sdkman.io" | bash
source "$HOME/.sdkman/bin/sdkman-init.sh"
sdk help
sdk install schemacrawler

# Diagram of all five AdventureWorks schemas as a single PNG.
# NOTE(review): passing --password on the command line leaks the secret via
# `ps` and shell history; fine for this lab, avoid in real deployments.
schemacrawler.sh \
  --server=sqlserver \
  --host=pdc.pentaho.lab \
  --port=1433 \
  --database=AdventureWorks2022 \
  --user=sa \
  --password=StrongPassword123 \
  --info-level=maximum \
  --command=schema \
  --schemas=".*\.(HumanResources|Person|Production|Purchasing|Sales)" \
  --output-format=png \
  --output-file=adventureworks2022_all_schemas.png

# HumanResources schema only
schemacrawler.sh \
  --server=sqlserver \
  --host=pdc.pentaho.lab \
  --port=1433 \
  --database=AdventureWorks2022 \
  --user=sa \
  --password=StrongPassword123 \
  --info-level=maximum \
  --command=schema \
  --schemas=".*\.HumanResources" \
  --output-format=png \
  --output-file=adventureworks2022_hr_schema.png

# The snippets below show only the options that change; substitute them into
# the full schemacrawler.sh command above.

# Person schema only
--schemas=".*\.Person"
--output-file=adventureworks2022_person_schema.png

# Production schema only
--schemas=".*\.Production"
--output-file=adventureworks2022_production_schema.png

# Sales schema only
--schemas=".*\.Sales"
--output-file=adventureworks2022_sales_schema.png

# Purchasing schema only
--schemas=".*\.Purchasing"
--output-file=adventureworks2022_purchasing_schema.png

# HTML report
--output-format=html --output-file=schema.html
# JSON output
--output-format=json --output-file=schema.json
# CSV data
--output-format=csv --output-file=schema.csv
# Text report
--output-format=text --output-file=schema.txt
# SVG diagram (vector graphics)
--output-format=svg --output-file=schema.svg
# PDF diagram
--output-format=pdf --output-file=schema.pdf

# Basic schema info
--command=brief
# Detailed schema with all metadata
--command=details
# List all objects
--command=list

# Count rows in all tables
--command=count --output-file=row_counts.txt
# Dump sample data from tables
--command=dump --output-file=sample_data.txt
# Quick data dump (faster, no ordering)
--command=quickdump --output-file=quick_data.txt

# Find schema design issues
--command=lint --output-file=schema_lint_report.txt

# Generate comprehensive HTML documentation
schemacrawler.sh \
  --server=sqlserver \
  --host=pdc.pentaho.lab \
  --port=1433 \
  --database=AdventureWorks2022 \
  --user=sa \
  --password=StrongPassword123 \
  --info-level=maximum \
  --command=details \
  --schemas=".*\.(HumanResources|Person|Production|Purchasing|Sales)" \
  --output-format=html \
  --output-file=adventureworks_complete_docs.html

# Find potential schema issues
schemacrawler.sh \
  --server=sqlserver \
  --host=pdc.pentaho.lab \
  --port=1433 \
  --database=AdventureWorks2022 \
  --user=sa \
  --password=StrongPassword123 \
  --info-level=maximum \
  --command=lint \
  --schemas=".*\.(HumanResources|Person|Production|Purchasing|Sales)" \
  --output-file=schema_quality_report.txt

# Make the setup script executable.
# NOTE(review): the original shows both "Workshop--Data-Catalog" (double dash)
# and "Workshop-Data-Catalog" — confirm the actual directory name on disk.
chmod +x /home/pdc/Workshop--Data-Catalog/Projects/data-discovery/setup.sh

# Run the setup script
cd ~/Workshop-Data-Catalog/Projects/data-discovery
./setup.sh

# Move into the project and configure it
cd ~/Projects/Data_Discovery
# Configure environment
nano .env   # Update database credentials
# Download JDBC drivers
# Place them in ./drivers/ directory
# Update configuration (if needed)
nano config/config.yaml

# Environment Variables for Data Discovery
# Copy this file to .env and update with your actual values

# AdventureWorks Database Connection
AW_HOST=localhost
AW_PORT=1433
AW_DATABASE=AdventureWorks2022
AW_USERNAME=sa
AW_PASSWORD=StrongPassword123

# General Database Connection (for other databases)
DB_HOST=localhost
DB_PORT=1433
DB_DATABASE=AdventureWorks2022
DB_USERNAME=sa
DB_PASSWORD=StrongPassword123
DB_DRIVER=sqlserver

# JDBC Driver Path (optional)
JDBC_DRIVER_PATH=./drivers/

# Output Configuration
OUTPUT_DIR=./reports
LOG_DIR=./logs

# API Keys (if needed for external services)
# ENCRYPTION_KEY=your_encryption_key
# API_KEY=your_api_key

# =============================================================================
# Data Discovery Configuration File
# =============================================================================
# Complete configuration matching README.md documentation
# Update with your specific database and environment settings
#
# This file controls all aspects of the Data Discovery application including:
# - Database connectivity and performance
# - PII classification rules and thresholds
# - Trust scoring methodology (Industry standard)
# - Dashboard behavior and caching
# - Business domain mapping and analytics
# =============================================================================
# NOTE(review): the extracted copy of this file had lost all indentation,
# which makes YAML nesting invalid; the 2-space nesting below is reconstructed
# from the section comments — verify against the original config.yaml.

# =============================================================================
# DATABASE CONFIGURATION
# =============================================================================
# Controls how the application connects to and interacts with your database
database:
  # Basic connection settings - Update these for your database
  host: "localhost"                  # Database server hostname or IP
  port: 1433                         # Database port (1433 for SQL Server)
  database: "AdventureWorks2022"     # Database name to analyze
  username: "sa"                     # Database username
  password: "${DB_PASSWORD}"         # Use environment variable for security
  driver: "sqlserver"                # Database type: sqlserver, postgresql, mysql, oracle

  # Connection management and timeouts
  connection_timeout: 30             # Seconds to wait for initial connection
  query_timeout: 300                 # Seconds to wait for query completion (5 minutes)
  max_connections: 10                # Maximum concurrent database connections

  # Performance optimization for large databases
  batch_size: 1000                   # Number of tables to process in each batch
  parallel_workers: 4                # Number of parallel processing threads

  # JDBC-specific connection parameters
  jdbc_url: null                     # Custom JDBC URL (overrides host/port if set)
  additional_params:
    TrustServerCertificate: "true"   # Accept self-signed certificates
    IntegratedSecurity: "false"      # Use SQL Server authentication

# =============================================================================
# DISCOVERY CONFIGURATION
# =============================================================================
# Controls what data is discovered and how the analysis is performed
discovery:
  # Database schemas to include in discovery
  # Leave empty ([]) to discover all schemas, or specify specific ones
  schemas:
    - "Sales"            # Sales and customer transactions
    - "Production"       # Manufacturing and inventory data
    - "HumanResources"   # Employee and HR information
    - "Purchasing"       # Vendor and procurement data
    - "Person"           # Customer and contact information

  # Data sampling for analysis (affects performance vs. accuracy)
  sample_size: 100           # Number of rows to sample per table for pattern analysis

  # Usage pattern analysis timeframe
  usage_analysis_days: 30    # Days of historical usage data to analyze

# =============================================================================
# CLASSIFICATION CONFIGURATION
# =============================================================================
# Controls PII detection and data classification using Pentaho methodology
classification:
  # Classification accuracy vs. performance trade-off
  confidence_threshold: 0.7          # Minimum confidence score (0.0-1.0) to classify as PII

  # Rule engine settings
  enable_custom_rules: true          # Enable user-defined classification patterns
  enable_adventureworks_rules: true  # Enable AdventureWorks-specific classification rules

  # Pentaho standard PII categories to detect
  # These align with industry-standard privacy frameworks (GDPR, CCPA, etc.)
  pii_categories:
    - "GOVERNMENT_ID"    # SSN, passport, national ID, tax ID
    - "PERSONAL_NAME"    # First, last, full names, display names
    - "CONTACT_INFO"     # Email addresses, phone numbers
    - "ADDRESS_INFO"     # Street, city, zip, postal codes
    - "DEMOGRAPHIC"      # Birth date, gender, age, ethnicity
    - "FINANCIAL"        # Salary, credit cards, bank accounts
    - "HEALTH"           # Medical records, diagnosis, treatment
    - "AUTHENTICATION"   # Passwords, secrets, tokens
    - "BIOMETRIC"        # Fingerprints, photos, biometric data

  # Performance optimization
  cache_results: true    # Cache classification results to improve performance

  # Custom classification patterns
  custom_patterns:
    - name: "Custom SSN Pattern"
      category: "GOVERNMENT_ID"
      sensitivity: "HIGH"
      column_patterns: ["ssn", "social_security"]
      data_patterns: ["^\\d{3}-?\\d{2}-?\\d{4}$"]
    - name: "Email Pattern"
      category: "CONTACT_INFO"
      sensitivity: "MEDIUM"
      column_patterns: ["email", "mail"]
      data_patterns: ["^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$"]

# =============================================================================
# TRUST SCORING CONFIGURATION (Industry Methodology)
# =============================================================================
# Implements Industry-standard trust scoring methodology
# Reference: https://academy.pentaho.com/pentaho-data-catalog/use-case/adventure-works/
trust_scoring:
  # Industry methodology component weights (must sum to 1.0)
  # These weights determine how different factors contribute to the overall trust score
  business_value_weight: 0.30   # Business importance and usage (30%)
  data_type_weight: 0.25        # Data type appropriateness and consistency (25%)
  constraints_weight: 0.25      # Structural integrity and constraints (25%)
  naming_weight: 0.20           # Naming conventions and descriptiveness (20%)

  # Trust score classification thresholds (0-100 scale)
  excellent_threshold: 75       # Excellent governance (75-100) - Green
  good_threshold: 65            # Good governance (65-74) - Light Green
  fair_threshold: 50            # Fair governance (50-64) - Yellow
                                # Below 50 = Poor governance - Red

  # Risk level assessment thresholds
  critical_threshold: 40        # Critical risk (0-39) - Immediate attention required
  high_threshold: 60            # High risk (40-59) - Priority remediation
  medium_threshold: 75          # Medium risk (60-74) - Monitor and improve
                                # Above 75 = Low risk - Well governed

# =============================================================================
# DASHBOARD CONFIGURATION
# =============================================================================
# Controls the web dashboard behavior, performance, and user experience
dashboard:
  # Caching settings for improved performance
  enable_caching: true          # Enable server-side caching of dashboard data
  cache_duration: 300           # Cache lifetime in seconds (5 minutes)

  # User interface display limits
  max_results_per_page: 100     # Maximum items per page in filtered views
  max_records_display: 10000    # Maximum total records to display in UI

  # Performance optimization
  enable_compression: true      # Compress API responses to reduce bandwidth

# =============================================================================
# COMPLIANCE CONFIGURATION
# =============================================================================
# Defines regulatory frameworks and compliance assessment parameters
compliance:
  # Regulatory frameworks to assess against
  # The system will evaluate data handling practices against these standards
  regulations:
    - "GDPR"      # General Data Protection Regulation (EU)
    - "CCPA"      # California Consumer Privacy Act (US)
    - "HIPAA"     # Health Insurance Portability and Accountability Act (US)
    - "SOX"       # Sarbanes-Oxley Act (US)
    - "PCI_DSS"   # Payment Card Industry Data Security Standard

  # Compliance monitoring schedule
  assessment_frequency: 90      # Days between automated compliance assessments

# =============================================================================
# OUTPUT CONFIGURATION
# =============================================================================
# Controls report generation, file formats, and export behavior
output:
  # File system settings
  directory: "./reports"        # Directory for generated reports and exports

  # Supported export formats
  formats:
    - "csv"    # Comma-separated values (data analysis)
    - "json"   # JavaScript Object Notation (API integration)
    - "xlsx"   # Excel workbook (business users)
    - "pdf"    # Portable Document Format (executive reports)

  # Data inclusion settings
  include_samples: true         # Include sample data values in classification reports

  # File naming convention
  timestamp_format: "%Y%m%d_%H%M%S"   # Format: YYYYMMDD_HHMMSS

# =============================================================================
# LOGGING CONFIGURATION
# =============================================================================
# Controls application logging, debugging, and troubleshooting capabilities
logging:
  # Log verbosity level
  # DEBUG: Detailed diagnostic information
  # INFO: General operational messages (recommended)
  # WARNING: Important warnings and issues
  # ERROR: Only error conditions
  level: "INFO"

  # Log file location
  file: "./logs/data_discovery.log"   # Main application log file

  # Log rotation to prevent disk space issues
  max_file_size: "10MB"   # Maximum size before rotation
  backup_count: 5         # Number of backup log files to keep

# =============================================================================
# BUSINESS DOMAIN MAPPING
# =============================================================================
# Maps database schemas to business domains and defines their relative importance
business_domains:
  # Schema to business domain classification
  # This mapping helps contextualize data within business functions
  schema_mappings:
    Sales: "SALES"                  # Sales transactions and customer relationships
    Production: "MANUFACTURING"     # Manufacturing processes and inventory
    HumanResources: "HR"            # Employee data and human resources
    Purchasing: "PROCUREMENT"       # Vendor relationships and procurement
    Person: "CUSTOMER_DATA"         # Customer profiles and contact information

  # Business criticality weights (1-10 scale)
  # Higher weights indicate more business-critical data requiring stronger governance
  importance_weights:
    SALES: 10.0            # Highest priority - revenue generating
    CUSTOMER_DATA: 9.0     # High priority - customer relationships
    HR: 9.0                # High priority - employee information
    MANUFACTURING: 8.0     # Important - operational data
    PROCUREMENT: 7.0       # Important - vendor relationships
    DEFAULT: 6.0           # Standard priority for unmapped schemas

# =============================================================================
# ADVANCED ANALYTICS CONFIGURATION
# =============================================================================
# Controls advanced data quality analysis and usage pattern tracking
analytics:
  # Feature enablement
  enable_advanced_analytics: true   # Enable comprehensive data quality analysis

  # Data quality assessment thresholds (0.0-1.0 scale)
  quality_threshold: 0.8            # Minimum quality score for "good" data
  completeness_threshold: 0.9       # Minimum completeness for "complete" data

  # Usage pattern tracking and analysis
  track_usage_patterns: true        # Monitor and analyze data access patterns
  usage_retention_days: 365         # Days of usage history to retain for analysis
# Install and launch the Data Discovery dashboard.
# NOTE(review): the project directory appears as both "Data-Discovery" and
# "Data_Discovery" in the original page — verify which one exists on disk.
cd ~/Projects/Data-Discovery
./install.sh

# Quick health check of the dashboard API
curl -s http://localhost:6001/api/health | python3 -c "import json, sys; data = json.load(sys.stdin); print('Dashboard Status:', data.get('status', 'Unknown'))"

# Comprehensive system verification via the analytics endpoint
echo "=== COMPREHENSIVE SYSTEM VERIFICATION ===" && curl -s http://localhost:6001/api/analytics | python3 -c "import json, sys; data = json.load(sys.stdin); print(' Analytics API: ✅ Working'); print(' Total Tables:', data.get('inventory', {}).get('total_tables', 'N/A')); print(' Total Columns:', data.get('classification', {}).get('total_columns', 'N/A')); print(' Classified Columns:', data.get('classification', {}).get('classified_columns', 'N/A')); print('⭐ Database Trust Score:', data.get('database_trust_score', 'N/A')); print(' Governance Level:', data.get('governance_level', 'N/A')); print(' Trust Score Levels:', len(data.get('trust_scores', {}))); print('️ Classification Categories:', len(data.get('classification', {}).get('category_distribution', {}))); print('✅ DAMA-DMBOK Integration: COMPLETE')"

# Start the dashboard server. Run this in its own terminal, or append " &"
# to background it before running the readiness check below.
cd ~/Projects/Data_Discovery
python3 dashboard/server.py --port 6001

# Poll /healthz (20 tries x 0.5s = ~10s) until the server answers 200
bash -lc 'for i in {1..20}; do STATUS=$(curl -s -o /dev/null -w "%{http_code}" http://127.0.0.1:6001/healthz || true); if [ "$STATUS" = "200" ]; then echo READY; exit 0; fi; sleep 0.5; done; echo NOT_READY'

# Stop whatever process is listening on port 6001 (graceful TERM, no -9)
PID=$(lsof -iTCP:6001 -sTCP:LISTEN -t 2>/dev/null || true)
if [ -n "$PID" ]; then
  echo "Killing PID $PID (port 6001)"
  kill $PID || true
  sleep 1
fi
echo "Done."