Skip to main content

Smart Excel Import in Python: Flexible Column Detection and Heterogeneous Data Cleaning

Rogelio Guerra Riverón
Author
Rogelio Guerra Riverón
Building my own web infrastructure from scratch. Here I document each step: servers, networks, containers and everything that comes along.

I’ve been working with Excel spreadsheets that arrive from different departments. Each one uses different column names, the data is dirty (phone numbers with notes, tax IDs mixed with text), and codes have inconsistent formats. Here I document the solution I built.

The real problem
#

I was receiving Excel files where:

  • Columns called “NIF” in one, “CIF” in another, “Identification” in the third
  • Phone numbers like “123-456-7890 (ext 5)”, “9876543210 - unavailable”
  • Tax IDs with dashes, spaces and varied letters
  • Product codes with inconsistent prefixes

I couldn’t expect everyone to format the same way. I needed a system that was flexible.

Solution architecture
#

My approach uses three layers:

  1. Column detection by regex (finds “nif”, “cif”, “identification” with fuzzy matching)
  2. Specialized cleaners for each data type
  3. Validation and logging for audit

Step-by-step implementation
#

1. Load the file and detect columns
#

from openpyxl import load_workbook
import re

def detectar_columnas(archivo_excel):
    wb = load_workbook(archivo_excel)
    ws = wb.active
    
    encabezados = [cell.value for cell in ws[1]]
    
    mapa_columnas = {
        'nif': detectar_columna(encabezados, r'nif|cif|identificaci'),
        'telefono': detectar_columna(encabezados, r'telef|phone|contacto'),
        'nombre': detectar_columna(encabezados, r'nombre|name|razón'),
        'codigo': detectar_columna(encabezados, r'código|code|articulo')
    }
    
    return wb, ws, mapa_columnas

def detectar_columna(encabezados, patron):
    for idx, encabezado in enumerate(encabezados):
        if encabezado and re.search(patron, str(encabezado).lower()):
            return idx
    return None

2. Clean and validate NIF/CIF
#

class LimpiadorNIF:
    PATRON_NIF = r'^([0-9]{8}[A-Z]|[XYZ][0-9]{7}[A-Z])$'
    
    @staticmethod
    def limpiar(valor):
        if not valor:
            return None
        
        # Extraer solo números y letras
        limpio = re.sub(r'[^A-Z0-9]', '', str(valor).upper())
        
        # Si tiene más de 9 caracteres, puede ser NIF + texto
        match = re.search(r'([0-9]{8}[A-Z]|[XYZ][0-9]{7}[A-Z])', limpio)
        if match:
            return match.group(1)
        
        return None if not re.match(LimpiadorNIF.PATRON_NIF, limpio) else limpio

3. Extract “clean” phone numbers
#

class LimpiadorTelefono:
    @staticmethod
    def limpiar(valor):
        if not valor:
            return None, None
        
        texto = str(valor)
        
        # Extraer solo números (mínimo 9 dígitos)
        numeros = re.sub(r'\D', '', texto)
        telefono = numeros[-9:] if len(numeros) >= 9 else None
        
        # Detectar notas (texto entre paréntesis o después de guiones)
        notas = re.search(r'(\([^)]+\)|-.+)', texto)
        nota = notas.group(1).strip() if notas else None
        
        return telefono, nota

4. Normalize codes with variable prefixes
#

class LimpiadorCodigo:
    @staticmethod
    def limpiar(valor, prefijo_esperado='PRD'):
        if not valor:
            return None
        
        # Convertir a mayúsculas y eliminar espacios
        limpio = str(valor).upper().strip()
        
        # Extraer la parte numérica
        match = re.search(r'([A-Z]*)?(\d+)', limpio)
        if match:
            numero = match.group(2)
            # Garantizar formato consistente
            return f"{prefijo_esperado}{numero.zfill(6)}"
        
        return None

5. Process the complete file
#

def importar_excel(ruta_archivo):
    wb, ws, columnas = detectar_columnas(ruta_archivo)
    
    registros = []
    errores = []
    
    for fila_idx, fila in enumerate(ws.iter_rows(min_row=2, values_only=False), start=2):
        registro = {}
        
        try:
            if columnas['nif'] is not None:
                nif_raw = fila[columnas['nif']].value
                registro['nif'] = LimpiadorNIF.limpiar(nif_raw)
            
            if columnas['telefono'] is not None:
                tel_raw = fila[columnas['telefono']].value
                tel, nota = LimpiadorTelefono.limpiar(tel_raw)
                registro['telefono'] = tel
                registro['nota_telefono'] = nota
            
            if columnas['codigo'] is not None:
                cod_raw = fila[columnas['codigo']].value
                registro['codigo'] = LimpiadorCodigo.limpiar(cod_raw)
            
            registros.append(registro)
        
        except Exception as e:
            errores.append(f"Fila {fila_idx}: {str(e)}")
    
    return registros, errores

Production use
#

registros, errores = importar_excel('ventas_marzo.xlsx')

if errores:
    print(f"⚠️ {len(errores)} errores encontrados:")
    for error in errores[:5]:
        print(f"  - {error}")

print(f"✓ {len(registros)} registros procesados correctamente")

Lessons learned
#

  • Regex is the gold standard for dirty data
  • Always return the original + cleaned data for audit
  • Error logging by row makes debugging easier
  • Flexible column detection saved hours of support

This system has been in production for 6 months. I’ve only had to add 2 more cleaners. The key is keeping each cleaner independent.


Recommended equipment#

Affiliate links. No extra cost to you.