Understand model evaluation metrics and best practices for measuring performance.
```python
from langtrain import Evaluator

# Initialize evaluator
evaluator = Evaluator(task_type='text_classification')

# Built-in metrics
results = evaluator.evaluate(
    model=model,
    test_data=test_dataset,
    metrics=['accuracy', 'f1_score', 'precision', 'recall']
)

print(f"Accuracy: {results['accuracy']:.4f}")
print(f"F1-Score: {results['f1_score']:.4f}")

# For text generation tasks
gen_evaluator = Evaluator(task_type='text_generation')
gen_results = gen_evaluator.evaluate(
    model=model,
    test_data=test_dataset,
    metrics=['bleu', 'rouge', 'bert_score']
)
```
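If you want to sanity-check the built-in classification metrics against a reference implementation, the same quantities can be computed directly from a list of predictions. The sketch below uses scikit-learn rather than langtrain; the placeholder `y_true`/`y_pred` lists and the `'macro'` averaging strategy are assumptions for illustration and should match however your evaluator aggregates multi-class scores.

```python
# Independent cross-check of accuracy/precision/recall/F1 with scikit-learn.
# y_true and y_pred are placeholder label lists, not part of the langtrain API.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

y_true = [0, 1, 1, 0, 1]   # ground-truth labels (illustrative)
y_pred = [0, 1, 0, 0, 1]   # model predictions (illustrative)

accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average='macro', zero_division=0
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
```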
```python
# Define custom evaluation metric
def custom_domain_accuracy(predictions, labels, domain_weights):
    """Custom metric that weights accuracy by domain importance"""
    correct = 0
    total_weight = 0

    for pred, label, weight in zip(predictions, labels, domain_weights):
        if pred == label:
            correct += weight
        total_weight += weight

    return correct / total_weight if total_weight > 0 else 0

# Register custom metric
evaluator.register_metric('domain_accuracy', custom_domain_accuracy)

# Use in evaluation
results = evaluator.evaluate(
    model=model,
    test_data=test_dataset,
    metrics=['accuracy', 'domain_accuracy'],
    metric_params={'domain_accuracy': {'domain_weights': weights}}
)
```
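Because `custom_domain_accuracy` is a plain Python function, you can unit-test it in isolation before registering it. A minimal standalone check with toy inputs (the labels and weights below are made up for illustration):

```python
# Toy check of custom_domain_accuracy: the 'spam' domain is weighted higher.
predictions = ['spam', 'ham',  'spam', 'ham']
labels      = ['spam', 'spam', 'spam', 'ham']
weights     = [2.0,    2.0,    1.0,    1.0]

score = custom_domain_accuracy(predictions, labels, weights)
print(f"Domain-weighted accuracy: {score:.4f}")  # (2.0 + 1.0 + 1.0) / 6.0 = 0.6667
```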
```python
# Cross-validation evaluation
from langtrain.evaluation import CrossValidator

cv = CrossValidator(
    folds=5,
    stratified=True,
    random_state=42
)

cv_results = cv.evaluate(
    model=model,
    data=dataset,
    metrics=['accuracy', 'f1_score']
)

print(f"CV Accuracy: {cv_results['accuracy'].mean():.4f} ± {cv_results['accuracy'].std():.4f}")

# Temporal split for time-series data
from langtrain.evaluation import TemporalSplit

temporal_split = TemporalSplit(
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    time_column='timestamp'
)

train, val, test = temporal_split.split(dataset)

# Evaluate on temporal test set
temporal_results = evaluator.evaluate(
    model=model,
    test_data=test,
    metrics=['accuracy', 'f1_score']
)
```
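The key property of a temporal split is that no future rows leak into training. A minimal pandas sketch of the same 70/15/15 chronological split (`temporal_split_df`, the `df` argument, and the `'timestamp'` column name are assumptions for illustration, not the TemporalSplit implementation):

```python
# Chronological 70/15/15 split on a DataFrame sorted by timestamp.
import pandas as pd

def temporal_split_df(df: pd.DataFrame, train_size=0.7, val_size=0.15):
    df_sorted = df.sort_values('timestamp').reset_index(drop=True)
    n = len(df_sorted)
    train_end = int(n * train_size)
    val_end = int(n * (train_size + val_size))
    return (
        df_sorted.iloc[:train_end],         # oldest rows -> train
        df_sorted.iloc[train_end:val_end],  # middle rows -> validation
        df_sorted.iloc[val_end:],           # newest rows -> test
    )
```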
```python
# Compare multiple models
from langtrain.evaluation import ModelComparator

comparator = ModelComparator(
    models=[model1, model2, model3],
    model_names=['BERT', 'RoBERTa', 'DistilBERT']
)

comparison_results = comparator.compare(
    test_data=test_dataset,
    metrics=['accuracy', 'f1_score', 'inference_time'],
    statistical_test='mcnemar'  # McNemar's test for significance
)

# Generate comparison report
comparator.generate_report(
    results=comparison_results,
    output_path='model_comparison_report.html',
    include_plots=True
)

print(comparison_results.summary())
```
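McNemar's test only looks at the examples where the two models disagree on correctness, so it is a natural choice for paired classifier comparisons. The sketch below shows one way to compute it with SciPy's exact binomial test; `mcnemar_exact` and its arguments are illustrative assumptions, not the langtrain API.

```python
# McNemar's test on two models' per-example correctness (exact binomial form).
from scipy.stats import binomtest

def mcnemar_exact(y_true, pred_a, pred_b):
    # b: model A correct, model B wrong;  c: model A wrong, model B correct
    b = sum(1 for t, a, p in zip(y_true, pred_a, pred_b) if a == t and p != t)
    c = sum(1 for t, a, p in zip(y_true, pred_a, pred_b) if a != t and p == t)
    if b + c == 0:
        return 1.0  # no discordant pairs: no evidence of a difference
    # Under H0 the discordant pairs split 50/50 between the two models
    return binomtest(b, n=b + c, p=0.5).pvalue
```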
```python
# Continuous evaluation setup
from langtrain.evaluation import ContinuousEvaluator

continuous_eval = ContinuousEvaluator(
    model=model,
    evaluation_schedule='daily',
    alert_thresholds={
        'accuracy': 0.85,  # Alert if accuracy drops below 85%
        'f1_score': 0.80
    }
)

# Monitor data drift
continuous_eval.enable_drift_detection(
    reference_data=training_data,
    drift_threshold=0.1
)

# Set up alerts
continuous_eval.configure_alerts(
    email=['team@company.com'],
    slack_webhook='https://hooks.slack.com/...'
)

# Start monitoring
continuous_eval.start()
```
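A common way to detect drift is to compare the distribution of a feature (or of model confidence scores) between the reference data and a recent production window. The sketch below uses a two-sample Kolmogorov-Smirnov test from SciPy; `feature_drifted`, the sample values, and the p-value threshold are all illustrative assumptions rather than langtrain's internal check.

```python
# Simple drift check: two-sample KS test between reference and recent values.
from scipy.stats import ks_2samp

def feature_drifted(reference_values, live_values, p_threshold=0.05):
    statistic, p_value = ks_2samp(reference_values, live_values)
    # A small p-value suggests the two samples come from different
    # distributions, i.e. the feature has drifted.
    return p_value < p_threshold, statistic

drifted, stat = feature_drifted([0.1, 0.2, 0.3, 0.25, 0.15],
                                [0.8, 0.9, 0.85, 0.7, 0.95])
print(f"Drift detected: {drifted} (KS statistic = {stat:.3f})")
```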