Files
backend/api/management/commands/migrate_to_minio.py

472 lines
17 KiB
Python

import os
import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from django.core.management.base import BaseCommand
from django.conf import settings
from minio import Minio
from api.record.models import Document
from api.datastage.models import DataStage
from api.vucem.models import Vucem
from api.reports.models import ReportDocument
class Command(BaseCommand):
help = 'Migra archivos existentes del sistema local a MinIO (versión optimizada)'
def add_arguments(self, parser):
parser.add_argument('--dry-run', action='store_true', help='Solo muestra lo que se migraría')
parser.add_argument('--model', type=str, help='Document, DataStage, Vucem, ReportDocument')
parser.add_argument('--limit', type=int, help='Límite de registros')
parser.add_argument('--batch-size', type=int, default=200, help='Tamaño del lote (default: 200)')
parser.add_argument('--workers', type=int, default=3, help='Número de workers (default: 3)')
parser.add_argument('--offset', type=int, default=0, help='Offset inicial (para reanudar)')
def __init__(self):
super().__init__()
self.client = None
self.bucket_name = None
def _init_minio_client(self):
"""Inicializa el cliente MinIO"""
if self.client is None:
self.client = Minio(
endpoint=os.getenv('MINIO_ENDPOINT', 'minio:9000'),
access_key=os.getenv('MINIO_ACCESS_KEY'),
secret_key=os.getenv('MINIO_SECRET_KEY'),
secure=os.getenv('MINIO_SECURE', 'false').lower() == 'true'
)
self.bucket_name = os.getenv('MINIO_BUCKET_NAME', 'efc-backend-dev')
def handle(self, *args, **options):
dry_run = options.get('dry_run', False)
model_filter = options.get('model')
limit = options.get('limit')
batch_size = options.get('batch_size', 200)
workers = options.get('workers', 3)
offset = options.get('offset', 0)
self.stdout.write(self.style.WARNING('=' * 60))
self.stdout.write(self.style.WARNING('INICIANDO MIGRACIÓN A MINIO (OPTIMIZADA)'))
self.stdout.write(self.style.WARNING(f'Batch size: {batch_size} | Workers: {workers} | Offset: {offset}'))
if dry_run:
self.stdout.write(self.style.WARNING('MODO: DRY RUN (sin cambios)'))
self.stdout.write(self.style.WARNING('=' * 60))
results = {}
if not model_filter or model_filter.lower() == 'document':
results['Document'] = self.migrate_documents(dry_run, limit, batch_size, workers, offset)
if not model_filter or model_filter.lower() == 'datastage':
results['DataStage'] = self.migrate_datastage(dry_run, limit, batch_size, workers, offset)
if not model_filter or model_filter.lower() == 'vucem':
results['Vucem'] = self.migrate_vucem(dry_run, limit, workers)
if not model_filter or model_filter.lower() == 'reportdocument':
results['ReportDocument'] = self.migrate_reports(dry_run, limit, batch_size, workers, offset)
# Resumen final
self.stdout.write('\n' + '=' * 60)
self.stdout.write(self.style.SUCCESS('RESUMEN DE MIGRACIÓN'))
self.stdout.write('=' * 60)
total_migrados = 0
total_no_encontrados = 0
total_errores = 0
for model_name, stats in results.items():
self.stdout.write(f"\n📁 {model_name}:")
self.stdout.write(f" ✅ Migrados: {stats['migrated']}")
self.stdout.write(f" ⚠️ No encontrados: {stats['not_found']}")
self.stdout.write(f" ❌ Errores: {stats['errors']}")
total_migrados += stats['migrated']
total_no_encontrados += stats['not_found']
total_errores += stats['errors']
self.stdout.write('\n' + '-' * 40)
self.stdout.write(f"📊 TOTAL Migrados: {total_migrados}")
self.stdout.write(f"📊 TOTAL No encontrados: {total_no_encontrados}")
self.stdout.write(f"📊 TOTAL Errores: {total_errores}")
if dry_run:
self.stdout.write('\n' + self.style.WARNING('⚠️ MODO DRY RUN - No se realizaron cambios'))
def get_local_file_path(self, path_str):
"""Obtiene la ruta completa del archivo local"""
return Path(settings.MEDIA_ROOT) / path_str
def migrate_documents(self, dry_run, limit, batch_size, workers, offset):
"""Migra documentos del modelo Document"""
self._init_minio_client()
stats = {'migrated': 0, 'not_found': 0, 'errors': 0}
queryset = Document.objects.exclude(archivo='').exclude(archivo__isnull=True)
queryset = queryset.exclude(archivo__startswith='org_')
queryset = queryset.order_by('created_at')
if offset:
queryset = queryset[offset:]
if limit:
queryset = queryset[:limit]
total = queryset.count()
self.stdout.write(f"\n📄 Procesando {total} documentos...")
if total == 0:
return stats
start_time = time.time()
processed = 0
# Procesar en lotes
for batch_start in range(0, total, batch_size):
batch = queryset[batch_start:batch_start + batch_size]
batch_docs = list(batch)
if dry_run:
stats['migrated'] += len(batch_docs)
processed += len(batch_docs)
self._print_progress(processed, total, start_time, stats)
continue
# Preparar items para workers
items = []
for doc in batch_docs:
path_str = str(doc.archivo)
local_path = self.get_local_file_path(path_str)
if not local_path.exists():
stats['not_found'] += 1
continue
pedimento_app = doc.pedimento.pedimento_app if doc.pedimento else 'unknown'
items.append({
'doc': doc,
'local_path': local_path,
'path_str': path_str,
'pedimento_app': pedimento_app
})
# Procesar en paralelo
if items:
with ThreadPoolExecutor(max_workers=workers) as executor:
futures = {executor.submit(self._upload_document, item): item for item in items}
for future in as_completed(futures):
result = future.result()
if result['success']:
stats['migrated'] += 1
else:
stats['errors'] += 1
processed += len(batch_docs)
self._print_progress(processed, total, start_time, stats)
total_time = time.time() - start_time
self.stdout.write(f"\n ✅ Completado en {total_time/60:.1f} minutos")
return stats
def _upload_document(self, item):
"""Sube un documento directamente a MinIO"""
try:
doc = item['doc']
local_path = item['local_path']
pedimento_app = item['pedimento_app']
filename = local_path.name
# Generar ruta MinIO
object_name = f"org_{doc.organizacion_id}/documents/{pedimento_app}/{filename}"
# Subir directamente a MinIO
self.client.fput_object(
bucket_name=self.bucket_name,
object_name=object_name,
file_path=str(local_path)
)
# Actualizar base de datos
doc.archivo = object_name
doc.save(update_fields=['archivo'])
return {'success': True, 'doc_id': doc.id}
except Exception as e:
return {'success': False, 'doc_id': doc.id, 'error': str(e)}
def migrate_datastage(self, dry_run, limit, batch_size, workers, offset):
"""Migra archivos del modelo DataStage"""
self._init_minio_client()
stats = {'migrated': 0, 'not_found': 0, 'errors': 0}
queryset = DataStage.objects.exclude(archivo='').exclude(archivo__isnull=True)
queryset = queryset.exclude(archivo__startswith='org_')
queryset = queryset.order_by('created_at')
if offset:
queryset = queryset[offset:]
if limit:
queryset = queryset[:limit]
total = queryset.count()
self.stdout.write(f"\n📦 Procesando {total} archivos DataStage...")
if total == 0:
return stats
start_time = time.time()
processed = 0
for batch_start in range(0, total, batch_size):
batch = queryset[batch_start:batch_start + batch_size]
batch_docs = list(batch)
if dry_run:
stats['migrated'] += len(batch_docs)
processed += len(batch_docs)
self._print_progress(processed, total, start_time, stats)
continue
items = []
for ds in batch_docs:
path_str = str(ds.archivo)
local_path = self.get_local_file_path(path_str)
if not local_path.exists():
stats['not_found'] += 1
continue
items.append({'ds': ds, 'local_path': local_path})
if items:
with ThreadPoolExecutor(max_workers=workers) as executor:
futures = {executor.submit(self._upload_datastage, item): item for item in items}
for future in as_completed(futures):
result = future.result()
if result['success']:
stats['migrated'] += 1
else:
stats['errors'] += 1
processed += len(batch_docs)
self._print_progress(processed, total, start_time, stats)
total_time = time.time() - start_time
self.stdout.write(f"\n ✅ Completado en {total_time/60:.1f} minutos")
return stats
def _upload_datastage(self, item):
"""Sube un DataStage directamente a MinIO"""
try:
ds = item['ds']
local_path = item['local_path']
filename = local_path.name
object_name = f"org_{ds.organizacion_id}/datastage/{filename}"
self.client.fput_object(
bucket_name=self.bucket_name,
object_name=object_name,
file_path=str(local_path)
)
ds.archivo = object_name
ds.save(update_fields=['archivo'])
return {'success': True, 'id': ds.id}
except Exception as e:
return {'success': False, 'id': ds.id, 'error': str(e)}
def migrate_vucem(self, dry_run, limit, workers):
"""Migra archivos key y cer del modelo Vucem"""
self._init_minio_client()
stats = {'migrated': 0, 'not_found': 0, 'errors': 0}
queryset = Vucem.objects.all()
if limit:
queryset = queryset[:limit]
total = queryset.count() * 2
self.stdout.write(f"\n🔐 Procesando {queryset.count()} registros VUCEM (key + cer)...")
if total == 0:
return stats
items = []
for vucem in queryset:
if vucem.key and not str(vucem.key).startswith('org_'):
path_str = str(vucem.key)
local_path = self.get_local_file_path(path_str)
if local_path.exists():
items.append({'vucem': vucem, 'local_path': local_path, 'tipo': 'key'})
else:
stats['not_found'] += 1
if vucem.cer and not str(vucem.cer).startswith('org_'):
path_str = str(vucem.cer)
local_path = self.get_local_file_path(path_str)
if local_path.exists():
items.append({'vucem': vucem, 'local_path': local_path, 'tipo': 'cer'})
else:
stats['not_found'] += 1
if dry_run:
stats['migrated'] = len(items)
self.stdout.write(f" 📝 [DRY RUN] Se migrarían {len(items)} archivos")
return stats
if items:
with ThreadPoolExecutor(max_workers=workers) as executor:
futures = {executor.submit(self._upload_vucem, item): item for item in items}
for future in as_completed(futures):
result = future.result()
if result['success']:
stats['migrated'] += 1
self.stdout.write(self.style.SUCCESS(f"{result['tipo']} migrado: {result['id']}"))
else:
stats['errors'] += 1
return stats
def _upload_vucem(self, item):
"""Sube un archivo VUCEM directamente a MinIO"""
try:
vucem = item['vucem']
local_path = item['local_path']
tipo = item['tipo']
filename = local_path.name
if tipo == 'key':
object_name = f"org_{vucem.organizacion_id}/vucem_keys/{filename}"
vucem.key = object_name
vucem.save(update_fields=['key'])
else:
object_name = f"org_{vucem.organizacion_id}/vucem_certs/{filename}"
vucem.cer = object_name
vucem.save(update_fields=['cer'])
self.client.fput_object(
bucket_name=self.bucket_name,
object_name=object_name,
file_path=str(local_path)
)
return {'success': True, 'id': vucem.id, 'tipo': tipo}
except Exception as e:
return {'success': False, 'id': vucem.id, 'tipo': tipo, 'error': str(e)}
def migrate_reports(self, dry_run, limit, batch_size, workers, offset):
"""Migra archivos del modelo ReportDocument"""
self._init_minio_client()
stats = {'migrated': 0, 'not_found': 0, 'errors': 0}
queryset = ReportDocument.objects.exclude(file='').exclude(file__isnull=True)
queryset = queryset.exclude(file__startswith='org_')
queryset = queryset.order_by('created_at')
if offset:
queryset = queryset[offset:]
if limit:
queryset = queryset[:limit]
total = queryset.count()
self.stdout.write(f"\n📊 Procesando {total} reportes...")
if total == 0:
return stats
start_time = time.time()
processed = 0
for batch_start in range(0, total, batch_size):
batch = queryset[batch_start:batch_start + batch_size]
batch_docs = list(batch)
if dry_run:
stats['migrated'] += len(batch_docs)
processed += len(batch_docs)
self._print_progress(processed, total, start_time, stats)
continue
items = []
for report in batch_docs:
path_str = str(report.file)
local_path = self.get_local_file_path(path_str)
if not local_path.exists():
stats['not_found'] += 1
continue
items.append({'report': report, 'local_path': local_path})
if items:
with ThreadPoolExecutor(max_workers=workers) as executor:
futures = {executor.submit(self._upload_report, item): item for item in items}
for future in as_completed(futures):
result = future.result()
if result['success']:
stats['migrated'] += 1
else:
stats['errors'] += 1
processed += len(batch_docs)
self._print_progress(processed, total, start_time, stats)
total_time = time.time() - start_time
self.stdout.write(f"\n ✅ Completado en {total_time/60:.1f} minutos")
return stats
def _upload_report(self, item):
"""Sube un reporte directamente a MinIO"""
try:
report = item['report']
local_path = item['local_path']
filename = local_path.name
filters = report.filters or {}
org_id = filters.get('organizacion_id', 'unknown')
object_name = f"org_{org_id}/reports/{filename}"
self.client.fput_object(
bucket_name=self.bucket_name,
object_name=object_name,
file_path=str(local_path)
)
report.file = object_name
report.save(update_fields=['file'])
return {'success': True, 'id': report.id}
except Exception as e:
return {'success': False, 'id': report.id, 'error': str(e)}
def _print_progress(self, processed, total, start_time, stats):
"""Imprime el progreso actual"""
elapsed = time.time() - start_time
rate = processed / elapsed if elapsed > 0 else 0
pct = processed * 100 / total if total > 0 else 0
self.stdout.write(
f" 📊 {processed}/{total} ({pct:.1f}%) | "
f"{rate:.0f} docs/seg | "
f"{stats['migrated']} | "
f"⚠️ {stats['not_found']} | "
f"{stats['errors']}"
)