#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Analyze tag duplicates in tags.json
"""
import json
import re
from collections import defaultdict

# Load the tag catalogue; the relative path resolves against the current
# working directory.
with open('tags.json', 'r', encoding='utf-8') as tags_file:
    data = json.loads(tags_file.read())

# A missing 'tags' key is treated as an empty catalogue.
tags = data.get('tags', [])
print("=== TAG DUPLICATE ANALYSIS ===\n")
print(f"Total tags: {len(tags)}\n")

# Lower-case keyword stems; a tag whose label contains any of them is
# classified as inappropriate by is_appropriate().
# NOTE: matching is by plain substring, so a stem also hits longer words that
# merely contain it (for example 'ставк' is found inside 'выставка').
BLACKLIST_KEYWORDS = [
    'умереть', 'умерет', 'смерть', 'похорон', 'могил', 'кладбищ', 'гроб', 'умирать',
    'секс', 'порн', 'эротик', 'интим',
    'наркот',
    'азарт', 'казино', 'ставк', 'букмекер',
    'убий', 'насил',
]


def is_appropriate(label):
    """Return True when *label* contains none of the blacklisted keywords.

    The comparison is case-insensitive (the label is lower-cased first) and
    uses substring matching against BLACKLIST_KEYWORDS.

    Args:
        label: Tag label to check. ``None`` or an empty value is accepted and
            treated as appropriate, the same as an empty string.

    Returns:
        False if any keyword occurs in the lower-cased label, True otherwise.
    """
    # A JSON null arrives here as None (dict.get only applies its default
    # when the key is missing), so guard before calling .lower().
    if not label:
        return True

    label_lower = str(label).lower()
    return not any(word in label_lower for word in BLACKLIST_KEYWORDS)

# Split the catalogue into acceptable and rejected tags in a single pass
appropriate_tags = []
rejected_tags = []
for entry in tags:
    bucket = appropriate_tags if is_appropriate(entry.get('label', '')) else rejected_tags
    bucket.append(entry)

inappropriate_count = len(rejected_tags)
print(f"Inappropriate tags found: {inappropriate_count}")
if rejected_tags:
    # List every rejected tag with its event count and slug
    print("Inappropriate tags:")
    for entry in rejected_tags:
        print(f"  - {entry.get('label')} ({entry.get('events_count', 0)} events) [slug: {entry.get('slug')}]")
    print()

# Group appropriate tags by the first five characters of their lower-cased
# label; tags that share this prefix are treated as candidate duplicates.
grouped = defaultdict(list)
for tag in appropriate_tags:
    # `or` rather than a .get default, so an explicit JSON null is covered too
    label = (tag.get('label') or '').lower()
    if len(label) < 2:
        # Too short to form a meaningful prefix
        continue

    # Slicing never overruns, so a label shorter than five characters simply
    # uses the whole label as its base.
    base = label[:5]

    grouped[base].append({
        'label': tag.get('label'),
        'slug': tag.get('slug'),
        # A null count becomes 0 so that sorting and summing the counts
        # further down cannot fail on None.
        'events_count': tag.get('events_count') or 0,
    })

# Keep only the prefix groups that hold at least two tags and carry enough
# events in total to be worth reporting.
duplicates = {}
for base, tag_list in grouped.items():
    if len(tag_list) <= 1:
        continue

    # Most popular tag first (the group's list is sorted in place)
    tag_list.sort(key=lambda entry: entry['events_count'], reverse=True)
    total_events = sum(entry['events_count'] for entry in tag_list)

    # Groups with fewer than five events overall are left out
    if total_events < 5:
        continue

    duplicates[base] = {'tags': tag_list, 'total_events': total_events}

print(f"=== DUPLICATE GROUPS FOUND: {len(duplicates)} ===\n")

# Rank the groups so that those with the most events come first
sorted_duplicates = sorted(
    duplicates.items(),
    key=lambda item: item[1]['total_events'],
    reverse=True,
)

# Report only the 50 highest-ranked groups
print("Top 50 duplicate groups:\n")
for rank, (prefix, info) in enumerate(sorted_duplicates[:50], start=1):
    print(f"#{rank} Base: '{prefix}' (total events: {info['total_events']})")
    for member in info['tags']:
        print(f"    - {member['label']} ({member['events_count']} events) [slug: {member['slug']}]")
    print()

# Build the hide list: within each group the first tag (the one with the most
# events) stays visible and every later tag is collected by slug.
blacklist_slugs = [
    member['slug']
    for info in duplicates.values()
    for member in info['tags'][1:]
]

# Print the hide-list recommendation, previewing at most 30 slugs.
preview = blacklist_slugs[:30]
print("\n=== BLACKLIST RECOMMENDATION ===")
print(f"Tags to hide: {len(blacklist_slugs)}")
print("\nFirst 30 slugs to add to blacklist:")
for slug in preview:
    print(f"  '{slug}',")

# Mention a remainder only when one exists; with 30 or fewer slugs an
# unguarded subtraction would report a zero or negative count.
remaining = len(blacklist_slugs) - len(preview)
if remaining > 0:
    print(f"\n... and {remaining} more")

# Show summary statistics for the whole run
print("\n=== STATISTICS ===")
print(f"Total appropriate tags: {len(appropriate_tags)}")
print(f"Tags in duplicate groups: {sum(len(g['tags']) for g in duplicates.values())}")
print(f"Tags recommended for hiding: {len(blacklist_slugs)}")
# appropriate_tags already excludes the inappropriate tags, so only the hidden
# duplicates are subtracted; subtracting inappropriate_count as well would
# count those tags twice.
print(f"Remaining visible tags: {len(appropriate_tags) - len(blacklist_slugs)}")

print("\n=== END ===")
