Learn how to load, preprocess, validate, and version datasets in LangTrain for model training.
```python
# Load dataset from various sources
from langtrain import Dataset

# From CSV
dataset = Dataset.from_csv('data.csv',
                           text_column='text',
                           label_column='label')

# From JSON
dataset = Dataset.from_json('data.jsonl')

# From HuggingFace
dataset = Dataset.from_huggingface('imdb')

# Custom preprocessing
dataset = Dataset.from_custom(
    path='custom_data/',
    preprocessor=custom_preprocessor
)
```
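The `from_custom` loader above references a `custom_preprocessor` callable that is never defined. Here is a minimal sketch of one; the record shape (a dict with `raw_text` and `sentiment` keys) and the requirement to return `text`/`label` fields are illustrative assumptions, not documented LangTrain behavior:

```python
# Hypothetical preprocessor for Dataset.from_custom. The input record
# shape is an assumption made for illustration.
def custom_preprocessor(record: dict) -> dict:
    # Normalize whitespace in the raw text field.
    record['text'] = ' '.join(record['raw_text'].split())
    # Map string labels to the integer ids expected downstream.
    label_map = {'negative': 0, 'positive': 1}
    record['label'] = label_map[record['sentiment']]
    return record
```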
```python
# Data preprocessing pipeline
dataset = dataset.preprocess([
    # Text cleaning
    dataset.clean_text(remove_urls=True, remove_special=True),

    # Tokenization
    dataset.tokenize(tokenizer='bert-base-uncased', max_length=512),

    # Data augmentation
    dataset.augment(techniques=['synonym_replacement', 'back_translation']),

    # Train/validation split
    dataset.split(train_size=0.8, stratify=True)
])

# Custom preprocessing function
def custom_preprocess(batch):
    batch['text'] = [text.lower().strip() for text in batch['text']]
    return batch

dataset = dataset.map(custom_preprocess, batched=True)
```
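To see what the batched map does, you can call `custom_preprocess` directly on a toy batch; this runs in plain Python with no LangTrain dependency:

```python
# Batched processing operates on column-oriented dicts: each key maps
# to a list of values for the whole batch.
batch = {'text': ['  Hello World  ', 'GOOD movie!'], 'label': [1, 1]}
print(custom_preprocess(batch)['text'])
# ['hello world', 'good movie!']
```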
```python
# Data quality analysis
quality_report = dataset.analyze_quality()
print(quality_report.summary())

# Validation checks
dataset.validate([
    'check_missing_values',
    'check_label_distribution',
    'check_text_length',
    'check_duplicates'
])

# Automatic data cleaning
dataset = dataset.clean(
    remove_duplicates=True,
    handle_missing='drop',
    min_text_length=10,
    max_text_length=1000
)
```
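If you want to reproduce the core of these cleaning rules outside LangTrain, for example before handing raw rows to a loader, the logic fits in a few lines of plain Python. The thresholds below simply mirror the `clean()` call above; the row format (dicts with `text` and `label` keys) is an assumption:

```python
def clean_rows(rows, min_len=10, max_len=1000):
    """Drop duplicate, missing, and out-of-range texts."""
    seen = set()
    cleaned = []
    for row in rows:
        text = row.get('text')
        if text is None or row.get('label') is None:
            continue  # handle_missing='drop'
        if not (min_len <= len(text) <= max_len):
            continue  # min/max text length bounds
        if text in seen:
            continue  # remove_duplicates=True
        seen.add(text)
        cleaned.append(row)
    return cleaned
```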
```python
# Version your datasets
dataset.save_version('v1.0', description='Initial dataset')

# Load specific version
dataset = Dataset.load_version('my_dataset', version='v1.0')

# Compare versions
comparison = Dataset.compare_versions('my_dataset', 'v1.0', 'v1.1')
print(comparison.statistics())
```
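Putting the pieces together, a typical workflow loads, cleans, and validates a dataset, then snapshots it so a training run can be reproduced against an exact dataset state. This sketch only chains calls already shown above; the version string and description are placeholders:

```python
from langtrain import Dataset

# Load, clean, and validate, then snapshot the result.
dataset = Dataset.from_csv('data.csv', text_column='text', label_column='label')
dataset = dataset.clean(remove_duplicates=True, handle_missing='drop',
                        min_text_length=10, max_text_length=1000)
dataset.validate(['check_missing_values', 'check_duplicates'])
dataset.save_version('v1.1', description='Cleaned and validated')
```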