This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
| wiki:ai:ml-pipeline-test [2025/06/06 12:58] – ddehamer | wiki:ai:ml-pipeline-test [2025/06/06 13:46] (current) – ddehamer | ||
|---|---|---|---|
| Line 2: | Line 2: | ||
| ===== Purpose ===== | ===== Purpose ===== | ||
| + | |||
| Test deployment of Azure ML Pipeline and look at outputs and download trained model. | Test deployment of Azure ML Pipeline and look at outputs and download trained model. | ||
| ===== Key Things Learned ===== | ===== Key Things Learned ===== | ||
| + | |||
| - When you create train.py and prep.py or any other environment files, they should be stored under ./src in your notebook directory. | - When you create train.py and prep.py or any other environment files, they should be stored under ./src in your notebook directory. | ||
| - You need to understand a bit of what you are trying to have it do as it can't think or make suppositions in a normal way. Garbage in, garbage out. | - You need to understand a bit of what you are trying to have it do as it can't think or make suppositions in a normal way. Garbage in, garbage out. | ||
| ===== Final Code ===== | ===== Final Code ===== | ||
| + | |||
| train.py | train.py | ||
| - | < | + | |
| + | < | ||
| import pandas as pd | import pandas as pd | ||
| import argparse | import argparse | ||
| Line 45: | Line 49: | ||
| prep.py | prep.py | ||
| - | < | + | |
| + | < | ||
| import pandas as pd | import pandas as pd | ||
| import argparse | import argparse | ||
| Line 65: | Line 70: | ||
| main() | main() | ||
| </ | </ | ||
| - | |||
| deployment_script.py | deployment_script.py | ||
| - | < | ||
| + | <code -> | ||
| + | # Step 1: Install SDK | ||
| + | !pip install --quiet --upgrade azure-ai-ml | ||
| + | |||
| + | # Step 2: Imports and MLClient setup | ||
| + | from azure.ai.ml import MLClient, Input, Output, dsl | ||
| + | from azure.identity import DefaultAzureCredential | ||
| + | from azure.ai.ml.entities import Environment, | ||
| + | from azure.ai.ml.constants import AssetTypes | ||
| + | import pandas as pd | ||
| + | import os | ||
| + | from uuid import uuid4 | ||
| + | |||
| + | # Step 3: Connect to workspace | ||
| + | ml_client = MLClient( | ||
| + | DefaultAzureCredential(), | ||
| + | subscription_id=" | ||
| + | resource_group_name=" | ||
| + | workspace_name=" | ||
| + | ) | ||
| + | |||
| + | # Step 4: Create sample data and register it | ||
| + | df = pd.DataFrame({ | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | }) | ||
| + | df.to_csv(" | ||
| + | |||
| + | data_asset = Data( | ||
| + | path=" | ||
| + | type=AssetTypes.URI_FILE, | ||
| + | description=" | ||
| + | name=" | ||
| + | ) | ||
| + | ml_client.data.create_or_update(data_asset) | ||
| + | |||
| + | # Step 5: Create Python scripts | ||
| + | os.makedirs(" | ||
| + | |||
| + | # Leave train.py and prep.py creation to previous steps or user updates | ||
| + | |||
| + | # Step 6: Define Environment | ||
| + | env = Environment( | ||
| + | name=" | ||
| + | image=" | ||
| + | conda_file={ | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | { | ||
| + | " | ||
| + | " | ||
| + | ] | ||
| + | } | ||
| + | ] | ||
| + | } | ||
| + | ) | ||
| + | ml_client.environments.create_or_update(env) | ||
| + | |||
| + | # Step 7: Create components from source | ||
| + | prep_component = CommandComponent( | ||
| + | name=" | ||
| + | description=" | ||
| + | inputs={" | ||
| + | outputs={" | ||
| + | code=" | ||
| + | command=" | ||
| + | environment=env, | ||
| + | compute=" | ||
| + | ) | ||
| + | ml_client.components.create_or_update(prep_component) | ||
| + | |||
| + | # Force new train component to avoid cache issues | ||
| + | train_component = CommandComponent( | ||
| + | name=f" | ||
| + | description=" | ||
| + | inputs={" | ||
| + | outputs={" | ||
| + | code=" | ||
| + | command=" | ||
| + | environment=env, | ||
| + | compute=" | ||
| + | ) | ||
| + | ml_client.components.create_or_update(train_component) | ||
| + | |||
| + | # Step 8: Define pipeline function | ||
| + | @dsl.pipeline(default_compute=" | ||
| + | def ml_pipeline(input_data): | ||
| + | prep_step = prep_component(input_data=input_data) | ||
| + | train_step = train_component(training_data=prep_step.outputs.output_data) | ||
| + | return {" | ||
| + | |||
| + | # Step 9: Submit pipeline with explicit output registration | ||
| + | pipeline_job = ml_pipeline( | ||
| + | input_data=Input(type=AssetTypes.URI_FILE, | ||
| + | ) | ||
| + | |||
| + | # Force Azure ML to track output | ||
| + | pipeline_job.outputs[" | ||
| + | type=AssetTypes.URI_FOLDER, | ||
| + | mode=" | ||
| + | ) | ||
| + | |||
| + | pipeline_job = ml_client.jobs.create_or_update(pipeline_job) | ||
| + | |||
| + | # Step 10: Stream logs | ||
| + | ml_client.jobs.stream(pipeline_job.name) | ||
| </ | </ | ||
NOTE: This is run from the Notebook, not from a Python script. | NOTE: This is run from the Notebook, not from a Python script. | ||
| + | |||
| + | ===== Explanation of Final Code ===== | ||
| + | |||
| + | ==== ✅ Step 1: Install SDK ==== | ||
| + | |||
| + | This ensures the latest version of the '' | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== ✅ Step 2: Imports and MLClient Setup ==== | ||
| + | |||
| + | This step loads all necessary modules from the Azure ML SDK ('' | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== ✅ Step 3: Connect to Workspace ==== | ||
| + | |||
| + | Here, the '' | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== ✅ Step 4: Create and Register Sample Data ==== | ||
| + | |||
| + | This step uses '' | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== ✅ Step 5: Create Python Scripts ==== | ||
| + | |||
| + | This step creates a folder called '' | ||
| + | |||
| + | * '' | ||
| + | * '' | ||
| + | |||
| + | These scripts are necessary because Azure ML pipelines use self-contained, | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== ✅ Step 6: Define Environment ==== | ||
| + | |||
| + | This step defines a custom environment ('' | ||
| + | |||
| + | The environment is registered in Azure ML and then reused in both the prep and training components. This decouples dependency management from the code logic and avoids issues from differing environments between local and remote runs. | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== ✅ Step 7: Create Components ==== | ||
| + | |||
| + | Two reusable components are defined using '' | ||
| + | |||
| + | * '' | ||
| + | * '' | ||
| + | |||
| + | Both components reference the '' | ||
| + | |||
| + | Importantly, | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== ✅ Step 8: Define Pipeline Function ==== | ||
| + | |||
| + | This defines the actual **DSL pipeline** using the '' | ||
| + | |||
| + | * The raw '' | ||
| + | * The output of '' | ||
| + | * The pipeline **returns '' | ||
| + | |||
| + | This encapsulates the ML process in a declarative, | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== ✅ Step 9: Submit Pipeline Job ==== | ||
| + | |||
| + | This constructs a pipeline job from the '' | ||
| + | |||
| + | To ensure the '' | ||
| + | |||
| + | <code -> | ||
+ | pipeline_job.outputs[" | ||
| + | </ | ||
| + | |||
| + | This tells Azure ML to persist and expose this output after the run, regardless of whether it's returned by the pipeline function or not. Without this, even correctly written files would not show up in the portal or SDK. | ||
| + | |||
| + | The job is then submitted using '' | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== ✅ Step 10: Stream Logs ==== | ||
| + | |||
| + | This line attaches to the running pipeline job and streams logs back to the notebook. It helps monitor progress in real-time and identify any failures as they happen. | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== ✅ Bonus: Post-Run Output Download (Outside Numbered Steps) ==== | ||
| + | |||
| + | After the job completes, '' | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== ✅ Why This Pipeline Matters ==== | ||
| + | |||
| + | This example demonstrates how to build a **modular, reproducible, | ||
| + | |||
| + | * Everything (code, data, environment, | ||
| + | * Each step is independent, | ||
| + | * Logs and artifacts are persisted and inspectable via both SDK and UI | ||
| + | * The pipeline can now be scheduled, automated, and extended | ||
| + | |||
| + | ===== Uses for the output model ===== | ||
| + | |||
| + | ==== 🚀 1. Deploy the Model for Real-Time Inference ==== | ||
| + | |||
| + | ==== Purpose: ==== | ||
| + | |||
| + | Allow other applications (e.g., web apps, mobile apps, services) to query the model in real time via an API. | ||
| + | |||
| + | ==== Implementation: | ||
| + | |||
| + | * Deploy the model using **Azure ML Online Endpoints** | ||
| + | * Wrap it in a scoring script ('' | ||
| + | * Use Azure’s **managed REST API** for secure, scalable access | ||
| + | |||
| + | ==== Example Use Cases: ==== | ||
| + | |||
| + | * Predict customer churn during a support call | ||
| + | * Make fraud detection decisions as a transaction is processed | ||
| + | * Recommend next-best-actions in a CRM interface | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== 🗃 2. Use the Model for Batch Scoring ==== | ||
| + | |||
| + | === Purpose: === | ||
| + | |||
| + | Process large datasets periodically to generate predictions at scale. | ||
| + | |||
| + | === Implementation: | ||
| + | |||
| + | * Use **Azure ML batch endpoints**, | ||
| + | * Read input from blob storage or a database | ||
| + | * Write predictions back to storage for analysis or ingestion into other systems | ||
| + | |||
| + | === Example Use Cases: === | ||
| + | |||
| + | * Score all users nightly to update risk profiles | ||
| + | * Predict part failures across all equipment in a factory | ||
| + | * Run loan approval predictions across pending applications | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== 🧪 3. Evaluate and Explain the Model ==== | ||
| + | |||
| + | === Purpose: === | ||
| + | |||
| + | Ensure the model is fair, explainable, | ||
| + | |||
| + | === Tools: === | ||
| + | |||
| + | * **Responsible AI Dashboard** for fairness, explanation, | ||
| + | * **SHAP or LIME** for feature importance | ||
| + | * **Model metrics dashboards** for precision, recall, ROC, etc. | ||
| + | |||
| + | === Example Use Cases: === | ||
| + | |||
| + | * Validate that your loan approval model isn’t biased against a demographic group | ||
| + | * Provide per-prediction feature attributions for compliance | ||
| + | * Tune decision thresholds based on business objectives | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== 🔐 4. Embed the Model in a Business Workflow ==== | ||
| + | |||
| + | === Purpose: === | ||
| + | |||
| + | Integrate predictions into real-time or batch operational systems to drive action. | ||
| + | |||
| + | === Integration Options: === | ||
| + | |||
| + | * Azure Functions or Logic Apps (real-time triggers) | ||
| + | * Azure Data Factory or Synapse pipelines (batch workflows) | ||
| + | * Event Grid / Event Hub for prediction-driven messaging | ||
| + | |||
| + | === Example Use Cases: === | ||
| + | |||
| + | * Auto-assign support tickets based on urgency prediction | ||
| + | * Escalate flagged transactions to fraud review team | ||
| + | * Enqueue predicted high-risk patients into care follow-up workflow | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== 🛡 5. Monitor and Manage the Model in Production ==== | ||
| + | |||
| + | === Purpose: === | ||
| + | |||
| + | Ensure the model performs well over time as real-world data changes. | ||
| + | |||
| + | === Actions: === | ||
| + | |||
| + | * Monitor prediction drift and data quality with **Azure ML Data Monitor** | ||
| + | * Set up retraining pipelines if performance drops | ||
| + | * Use **MLflow** or Azure model registry to version models and manage lifecycles | ||
| + | |||
| + | === Example Use Cases: === | ||
| + | |||
| + | * Detect concept drift in customer behavior post-promotion | ||
| + | * Auto-retrain recommendation model every 2 weeks | ||
| + | * Compare performance of two deployed model versions (A/B testing) | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== 🔁 6. Retrain or Fine-Tune the Model ==== | ||
| + | |||
| + | === Purpose: === | ||
| + | |||
| + | Keep the model up-to-date with fresh data, domain changes, or new features. | ||
| + | |||
| + | === Strategies: === | ||
| + | |||
| + | * Use a scheduled pipeline to retrain with new labeled data | ||
| + | * Add new features or tune hyperparameters | ||
| + | * Replace the model with an upgraded architecture (e.g., switching from logistic regression to XGBoost) | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== 🧠 Real-World Examples by Industry ==== | ||
| + | |||
| + | ^ Industry ^ Use of '' | ||
| + | | Finance | Credit risk scoring, fraud detection | | ||
| + | | Retail | Product recommendation, | ||
| + | | Healthcare | Diagnosis support, patient readmission risk | | ||
| + | | Manufacturing | Predictive maintenance, | ||
| + | | Logistics | Delivery delay prediction, route optimization | | ||
| + | | Cybersecurity | Threat classification, | ||
| + | |||
| + | ===== Reusability ===== | ||
| + | |||
| + | ===== ✅ Reusable As-Is If: ===== | ||
| + | |||
| + | You are solving **the same kind of problem** (e.g., binary classification using logistic regression) and the following stay consistent: | ||
| + | |||
| + | * **Input data structure**: | ||
| + | * '' | ||
| + | * **Preprocessing logic**: You still just sum '' | ||
| + | * **Model type**: You're still using a '' | ||
| + | * **Output format**: You expect the model to be saved as '' | ||
| + | |||
| + | ==== In this case: ==== | ||
| + | |||
| + | ✅ You only need to change the **CSV file** and re-register it as a new version of '' | ||
| + | |||
| + | <code -> | ||
+ | input_data=Input(type=AssetTypes.URI_FILE, | ||
| + | </ | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== 🔄 Requires Changes If: ==== | ||
| + | |||
| + | Your pipeline needs to be adapted for a different data structure or task. Here’s when you'd need to modify the scripts: | ||
| + | |||
| + | === 🔁 If your data columns change: === | ||
| + | |||
| + | * You'll need to update: | ||
| + | * '' | ||
| + | * '' | ||
| + | * Possibly retrain on different targets (multi-class, | ||
| + | |||
| + | === 🔁 If your model type changes: === | ||
| + | |||
| + | * If you switch from '' | ||
| + | * Update '' | ||
| + | * Possibly adjust hyperparameters and training logic | ||
| + | |||
| + | === 🔁 If your pipeline steps change: === | ||
| + | |||
| + | * Want to add validation? | ||
| + | * Want to split data into train/test? | ||
| + | * Want to evaluate model metrics? | ||
| + | * You’ll need new component scripts and return more outputs (e.g., '' | ||
| + | |||
| + | === 🔁 If your deployment format changes: === | ||
| + | |||
| + | * If your consumers expect ONNX or TensorFlow SavedModel instead of '' | ||
| + | * Serialize the model differently | ||
| + | * Possibly update the pipeline to convert formats | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== 🧰 To Make it Highly Reusable: ==== | ||
| + | |||
| + | You can make the pipeline truly production-grade and reusable by: | ||
| + | |||
| + | ^ Feature ^ How to Do It ^ | ||
| + | | Parametrize column names | Add '' | ||
| + | | Generalize preprocessing | Add preprocessing config file or flags | | ||
| + | | Model selector | Add '' | ||
| + | | Versioned output naming | Return '' | ||
| + | | Dynamic data input | Register new data via CLI, UI, or pipeline parameter | | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ==== ✅ Summary ==== | ||
| + | |||
| + | ^ Scenario ^ Reusable? ^ What to Change ^ | ||
| + | | Same data structure and model type | ✅ | Just update the input dataset version | | ||
| + | | Same structure, different model | 🔁 | Modify '' | ||
| + | | Different data columns or prediction target | 🔁 | Modify '' | ||
| + | | More complex workflow (e.g., evaluation, deployment) | 🔁 | Add steps and new component scripts | | ||
| + | |||
| + | ===== How to Deploy Model ===== | ||
| + | |||
| + | ==== ✅ High-Level Overview ==== | ||
| + | |||
| + | - **Prepare Scoring Script ('' | ||
| + | - **Create Inference Environment** | ||
| + | - **Register the Trained Model** | ||
| + | - **Create an Online Endpoint** | ||
| + | - **Deploy the Model to the Endpoint** | ||
| + | - **Test the Deployed Service** | ||
| + | |||
| + | ====== Errors Encountered During Session ====== | ||
| + | |||
| + | ===== 🔁 Environment Definition Issue ===== | ||
| + | |||
| + | ==== ❌ Problem: ==== | ||
| + | |||
| + | The '' | ||
| + | |||
| + | ==== ✅ Solution: ==== | ||
| + | |||
| + | The '' | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ===== 🔁 Dataset Reference Issue ===== | ||
| + | |||
| + | ==== ❌ Problem: ==== | ||
| + | |||
| + | When submitting the pipeline, Azure ML failed to resolve the dataset because the dataset path was given as ''" | ||
| + | |||
| + | ==== ✅ Solution: ==== | ||
| + | |||
| + | The dataset path was updated to use the full Azure ML URI syntax: ''" | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ===== 🔁 Output Not Persisted ===== | ||
| + | |||
| + | ==== ❌ Problem: ==== | ||
| + | |||
| + | Even though the '' | ||
| + | |||
| + | ==== ✅ Root Cause: ==== | ||
| + | |||
| + | The output directory was not explicitly registered in the pipeline job, and Azure ML silently discarded it. | ||
| + | |||
| + | ==== ✅ Solution: ==== | ||
| + | |||
| + | The pipeline job was updated to explicitly register '' | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ===== 🔁 Missing Script Execution ===== | ||
| + | |||
| + | ==== ❌ Problem: ==== | ||
| + | |||
| + | The '' | ||
| + | |||
| + | ==== ✅ Root Cause: ==== | ||
| + | |||
| + | The wrong '' | ||
| + | |||
| + | ==== ✅ Solution: ==== | ||
| + | |||
| + | The correct file (''/ | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ===== 🔁 Scoping Error in train.py ===== | ||
| + | |||
| + | ==== ❌ Problem: ==== | ||
| + | |||
| + | Print statements accessing '' | ||
| + | |||
| + | ==== ✅ Solution: ==== | ||
| + | |||
| + | The logging and '' | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ===== 🔁 Model Download Error ===== | ||
| + | |||
| + | ==== ❌ Problem: ==== | ||
| + | |||
| + | An attempt to use the '' | ||
| + | |||
| + | ==== ✅ Solution: ==== | ||
| + | |||
| + | The '' | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ===== 🔁 Silent Step Failure Due to Typo ===== | ||
| + | |||
| + | ==== ❌ Problem: ==== | ||
| + | |||
| + | The dataset path was mistyped as ''" | ||
| + | |||
| + | ==== ✅ Solution: ==== | ||
| + | |||
| + | The typo was corrected, and the step executed normally once a valid dataset path was provided. | ||
| + | |||
| + | ---- | ||
| + | |||
| + | ===== ✅ Final Outcome ===== | ||
| + | |||
| + | After resolving these issues: | ||
| + | |||
| + | * The pipeline executed end-to-end | ||
| + | * The model output was persisted and downloadable | ||
| + | * Logs confirmed proper script execution | ||
| + | * The deployment strategy was outlined, ready for API-based use | ||
| [[ai_knowledge|AI Knowledge]] | [[ai_knowledge|AI Knowledge]] | ||
| + | |||
| + | |||