#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LEO JSON Processor
This module processes JSON files for intent generation.
"""
import os
import json
import logging
from collections import Counter
class JSONProcessor:
    """Processes JSON files for intent generation.

    A processor loads a JSON document, summarises its top-level structure,
    collects the dotted paths of its keys and flags keys whose names look
    like names, locations or dates.

    Attributes:
        on_progress: Callable invoked with an integer percentage (10..100)
            as ``process`` advances. Defaults to a no-op.
        on_status: Callable invoked with a human-readable status string.
            Defaults to a no-op.
    """

    # Substrings (matched against the lower-cased key) that mark a key as a
    # potential entity of the given category.
    _NAME_TERMS = ('name', 'user', 'person', 'customer', 'client')
    _LOCATION_TERMS = ('city', 'state', 'country', 'address', 'location')
    _DATE_TERMS = ('date', 'time', 'day', 'year', 'month')

    # Only the first few list items are scanned for entities.
    _MAX_LIST_ITEMS = 5
    # Number of leading items whose type names are reported for mixed lists.
    _MAX_TYPE_SAMPLES = 10

    def __init__(self):
        """Initialize the JSON processor with no-op callbacks."""
        self.on_progress = lambda p: None
        self.on_status = lambda s: None

    def process(self, file_path):
        """Process a JSON file.

        Args:
            file_path (str): Path to the JSON file (read as UTF-8).

        Returns:
            dict: A dictionary with the keys ``'structure'``, ``'keys'``,
            ``'entities'`` and ``'data'`` (the parsed document itself).

        Raises:
            Exception: Any error raised while reading, parsing or analysing
                the file is logged with its traceback and re-raised
                unchanged.
        """
        try:
            self.on_status(f"Processing JSON file: {os.path.basename(file_path)}")
            self.on_progress(10)

            # Read file
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self.on_progress(30)

            # Analyze structure
            self.on_status("Analyzing JSON structure...")
            structure = self._analyze_structure(data)
            self.on_progress(50)

            # Extract key information
            self.on_status("Extracting key information...")
            keys = self._extract_keys(data)
            self.on_progress(70)

            # Identify potential entities
            self.on_status("Identifying potential entities...")
            entities = self._identify_entities(data)
            self.on_progress(90)

            # Combine results
            result = {
                'structure': structure,
                'keys': keys,
                'entities': entities,
                'data': data,  # Include the original data
            }
            self.on_progress(100)
            self.on_status("JSON processing complete")
            return result
        except Exception as e:
            # Log at the boundary, then let the caller decide how to react.
            logging.error("Error processing JSON file: %s", e, exc_info=True)
            raise

    def _analyze_structure(self, data):
        """Analyze the top-level structure of JSON data.

        Args:
            data: Parsed JSON data of any type.

        Returns:
            dict: Always contains ``'type'`` (the Python type name). For a
            dict it adds ``'keys'``, ``'num_keys'`` and ``'nested_types'``
            (first level only). For a list it adds ``'length'`` and, when
            the list is non-empty, either ``'common_keys'`` plus
            ``'sample_item'`` (every item is a dict) or ``'item_types'``
            (type names of the first items).
        """
        structure = {'type': type(data).__name__}

        if isinstance(data, dict):
            structure['keys'] = list(data)
            structure['num_keys'] = len(data)
            # Analyze nested structure (first level only)
            structure['nested_types'] = {
                key: type(value).__name__ for key, value in data.items()
            }
        elif isinstance(data, list):
            structure['length'] = len(data)
            if data:
                if all(isinstance(item, dict) for item in data):
                    first, rest = data[0], data[1:]
                    # Keys present in every item, reported in the order of
                    # the first item so the result is deterministic.
                    structure['common_keys'] = [
                        key for key in first
                        if all(key in item for item in rest)
                    ]
                    structure['sample_item'] = first
                else:
                    # Otherwise, just note the types of the leading items.
                    structure['item_types'] = [
                        type(item).__name__
                        for item in data[:self._MAX_TYPE_SAMPLES]
                    ]
        return structure

    def _extract_keys(self, data, prefix=''):
        """Extract all keys from nested JSON data as dotted paths.

        Args:
            data: Parsed JSON data.
            prefix (str): Path prefix for nested keys.

        Returns:
            list: Key paths such as ``'a.b'`` or ``'items[0].id'``. For
            lists only the first element is inspected.
        """
        keys = []
        if isinstance(data, dict):
            for key, value in data.items():
                full_key = f"{prefix}.{key}" if prefix else key
                keys.append(full_key)
                if isinstance(value, (dict, list)):
                    keys.extend(self._extract_keys(value, full_key))
        elif isinstance(data, list) and data:
            # For lists, the first item stands in for all items.
            if isinstance(data[0], (dict, list)):
                keys.extend(self._extract_keys(data[0], prefix + '[0]'))
        return keys

    def _identify_entities(self, data):
        """Identify potential entities in the JSON data by key name.

        Args:
            data: Parsed JSON data.

        Returns:
            dict: May contain ``'names'`` and ``'locations'`` (lists of
            ``(path, value)`` tuples, string values only) and ``'dates'``
            (list of paths). A category is present only if it matched.
        """
        entities = {}

        if isinstance(data, dict):
            self._scan_dict(data, '', entities)
        elif isinstance(data, list):
            # Check every inspected item individually: a mixed list must not
            # crash on its non-dict members.
            for i, item in enumerate(data[:self._MAX_LIST_ITEMS]):
                if isinstance(item, dict):
                    self._scan_dict(item, f"[{i}]", entities)
        return entities

    def _scan_dict(self, d, path, entities):
        """Record entity-like keys of ``d`` into ``entities``, recursing."""
        for key, value in d.items():
            key_lower = key.lower()
            current_path = f"{path}.{key}" if path else key

            # Categories are mutually exclusive and checked in priority
            # order: name, then location, then date.
            if any(term in key_lower for term in self._NAME_TERMS):
                if isinstance(value, str):
                    entities.setdefault('names', []).append((current_path, value))
            elif any(term in key_lower for term in self._LOCATION_TERMS):
                if isinstance(value, str):
                    entities.setdefault('locations', []).append((current_path, value))
            elif any(term in key_lower for term in self._DATE_TERMS):
                entities.setdefault('dates', []).append(current_path)

            # Recursively process nested dictionaries and lists
            if isinstance(value, dict):
                self._scan_dict(value, current_path, entities)
            elif isinstance(value, list):
                for i, item in enumerate(value[:self._MAX_LIST_ITEMS]):
                    if isinstance(item, dict):
                        self._scan_dict(item, f"{current_path}[{i}]", entities)