import os
import re
import shutil
from datetime import date
from urllib.parse import quote

# Wiki root that receives the sorted copies, and the folder of raw notes to sort.
BASE = os.path.expanduser("~/Desktop/Alex Wiki")
RAW = os.path.expanduser("~/Desktop/Alex Wiki/raw-notes")

# Category mapping: category path (relative to BASE) -> filenames assigned to it.
# The comprehension gives every category its own, independent list.
categories = {
    path: []
    for path in (
        "motherlabs/compiler",
        "motherlabs/architecture",
        "motherlabs/strategy",
        "ai-research/agents",
        "ai-research/models",
        "ai-research/theory",
        "projects/tattooclaw",
        "projects/aigency",
        "projects/misc",
        "personal/ideas",
        "personal/reflections",
        "personal/learning",
        "uncategorized",
    )
}

# Whole-word matchers. Plain substring tests for "ada" / "ai" also fire on
# "metadata", "adapt", "said", "main", "email", ... and misfile unrelated notes.
_ADA_WORD = re.compile(r"\bada\b")
_ADA_THEN_SPACE = re.compile(r"\bada ")
_AI_WORD = re.compile(r"\bai\b")


def _has_any(text, keywords):
    """Return True if any of *keywords* occurs as a substring of *text*."""
    return any(k in text for k in keywords)


def categorize(filename, content):
    """Pick the wiki category for one note from its filename and text.

    Rules are evaluated top to bottom and the first match wins, so the order
    of the checks below is significant. Filename tests are case-insensitive
    (done on the upper-cased name); content tests run on the lower-cased text
    and are plain substring matches, except for the short tokens "ada" and
    "ai", which must appear as whole words.

    Args:
        filename: Name of the note file.
        content: Text of the note; callers may pass only a leading excerpt.

    Returns:
        A category key such as "motherlabs/compiler" or "personal/ideas".
        "uncategorized" is returned when a short note (< 200 chars) looks
        like a stored secret, or when nothing matches and the stripped text
        is at least 300 characters long.
    """
    fn = filename.upper()
    cl = content.lower()
    content_len = len(content.strip())

    # Skip API keys / secrets - put in uncategorized.
    # The filename test uses the upper-cased name so "Api_Key" is caught too.
    looks_like_secret = "API_KEY" in fn or "sk-ant-" in content or "sk-proj-" in content
    if looks_like_secret and content_len < 200:
        return "uncategorized"

    # Rule 12: CAN'T HURT ME
    if "CANT-HURT-ME" in fn or "CAN'T HURT ME" in fn or "CANT HURT ME" in fn:
        return "personal/learning"

    # Rule 13: PLAYBOOK
    if "PLAYBOOK" in fn:
        return "motherlabs/strategy"

    # Rule 14: BOOTSTRAP FUTURE - content decides architecture vs. strategy
    if "BOOTSTRAP-FUTURE" in fn or "BOOTSTRAP FUTURE" in fn:
        bootstrap_arch_kw = ["schema", "ontology", "world model", "invariant", "axiom",
                             "type system", "typed", "architecture"]
        if _has_any(cl, bootstrap_arch_kw):
            return "motherlabs/architecture"
        return "motherlabs/strategy"

    # Rule 8: compiler keywords ("h(y" also covers "h(y|x")
    compiler_kw = ["qrpt", "h(y", "semantic compiler", "compilation stage", "compilation pipeline",
                   "ctx→", "int→", "per→", "ent→", "pro→", "syn→", "ver→", "gov→", "bld→",
                   "compiler stage", "9-stage", "nine stage", "postcode", "kompilator"]
    if _has_any(cl, compiler_kw):
        return "motherlabs/compiler"
    if _ADA_WORD.search(cl) and ("compil" in cl or "pipeline" in cl or "stage" in cl or "intent" in cl):
        return "motherlabs/compiler"

    # Rule 9: architecture keywords
    arch_kw = ["ontology", "world model", "worldmodel", "schema", "axiom", "invariant",
               "seed protocol", "seed-protocol"]
    if _has_any(cl, arch_kw):
        return "motherlabs/architecture"

    # Rule 10: agent keywords
    agent_kw = ["openclaw", "motherclaw", "david 8", "david-8", "multi-agent", "multiagent", "orchestrat",
                "autonomous agent", "agent system", "agent runtime", "agentic"]
    if _has_any(cl, agent_kw):
        return "ai-research/agents"
    if "agent" in cl and (_AI_WORD.search(cl) or "llm" in cl or "autonom" in cl):
        return "ai-research/agents"

    # Rule 11: theory keywords
    theory_kw = ["consciousness", "abiogenesis", "emergence", "quantum", "cosmolog", "philosophy of mind",
                 "strange loop", "entropy", "thermodynamic", "constraint theory", "free energy",
                 "boltzmann", "information theory"]
    if _has_any(cl, theory_kw):
        # "strange loop" about self could be reflections
        if "strange loop" in cl and ("myself" in cl or "my mind" in cl or "adhd" in cl or "i am" in cl):
            return "personal/reflections"
        return "ai-research/theory"

    # TattooClaw product
    tattoo_product_kw = ["tattooclaw", "tattoo ai", "tattoo os", "tattoo software", "tattoo platform",
                         "tattoo automat", "tattoo product"]
    if _has_any(cl, tattoo_product_kw):
        return "projects/tattooclaw"

    # AIgency
    if "aigency" in cl or ("agency" in cl and ("smb" in cl or "ai service" in cl)):
        return "projects/aigency"

    # Motherlabs general - check filename prefixes (compared against the upper-cased name)
    mlabs_prefixes = ("MLABS--", "MOTHERLABS--", "MANUAL-MOTHERLABS", "MOTHER-")
    if fn.startswith(mlabs_prefixes):
        # Sub-categorize
        if _has_any(cl, ["vision", "business", "strategy", "market", "position", "fundrais",
                         "pitch", "revenue", "customer"]):
            return "motherlabs/strategy"
        if _has_any(cl, ["compil", "pipeline", "stage", "intent", "qrpt", "h(y"]):
            return "motherlabs/compiler"
        if _has_any(cl, ["schema", "ontolog", "architect", "design", "system design", "type"]):
            return "motherlabs/architecture"
        if _has_any(cl, ["agent", "orchestr", "autonom"]):
            return "ai-research/agents"
        if _has_any(cl, ["prompt", "model", "claude", "gpt", "llm"]):
            return "ai-research/models"
        # Default mlabs -> compiler (core product)
        return "motherlabs/compiler"

    # Models / prompting
    model_kw = ["claude", "gpt-4", "gpt-3", "openai", "anthropic", "fine-tun", "prompting", "prompt engineering",
                "llm", "language model", "token", "context window", "embedding"]
    if _has_any(cl, model_kw):
        # Could be about using models for agents
        if "agent" in cl:
            return "ai-research/agents"
        return "ai-research/models"

    # Projects - websites, apps, tools
    project_kw = ["website", "app idea", "side project", "music prod", "ableton", "creative project",
                  "domain name", "landing page"]
    if _has_any(cl, project_kw):
        return "projects/misc"

    # Personal reflections
    reflect_kw = ["adhd", "self-aware", "personal growth", "motivation", "procrastinat", "my pattern",
                  "i realize", "i noticed", "i feel", "my life", "self-reflect", "burnout", "mental health",
                  "therapy", "meditat"]
    if _has_any(cl, reflect_kw):
        return "personal/reflections"

    # Learning
    learn_kw = ["book note", "course note", "tutorial", "studying", "learning", "notes from", "chapter "]
    if _has_any(cl, learn_kw):
        return "personal/learning"

    # Short fragments -> personal/ideas
    if content_len < 100:
        return "personal/ideas"

    # Latvian language detection: count how many common Latvian words /
    # diacritic letters occur; three or more distinct hits counts as Latvian.
    latvian_kw = ["ir ", "kas ", "nav ", "būt", "vai ", "arī ", "bet ", "kad ", "kā ", "par ", "man ", "tas ",
                  "šis ", "labi", "ā ", "ē ", "ī ", "ū ", "ļ", "ņ", "ģ", "ķ", "ž", "č", "š"]
    latvian_project_kw = ["projekts", "darbs", "kompilator", "sistēma"]
    latvian_count = sum(1 for k in latvian_kw if k in cl)
    if latvian_count >= 3:
        if _has_any(cl, latvian_project_kw):
            return "motherlabs/compiler"
        return "personal/ideas"

    # Broader Motherlabs detection
    if "motherlabs" in cl or "mother labs" in cl:
        return "motherlabs/strategy"

    # Broader compiler/ada detection ("ada" as a word followed by a space)
    if _ADA_THEN_SPACE.search(cl) and ("build" in cl or "system" in cl or "code" in cl):
        return "motherlabs/compiler"

    # Broader agent detection
    if "agent" in cl:
        return "ai-research/agents"

    # Technology / code related
    tech_kw = ["typescript", "javascript", "python", "react", "node", "api", "database", "docker",
               "deploy", "github", "git ", "npm", "pnpm", "bun ", "turbo"]
    if _has_any(cl, tech_kw):
        return "projects/misc"

    # Recipes, shopping, personal misc
    personal_misc_kw = ["recipe", "shopping", "grocery", "ingredients", "cook", "dinner", "lunch",
                        "breakfast", "workout", "exercise", "gym"]
    if _has_any(cl, personal_misc_kw):
        return "personal/ideas"

    # If content is very short or fragmentary
    if content_len < 300:
        return "personal/ideas"

    return "uncategorized"


# Process all files: classify every Markdown note in RAW by its first 2000
# characters and record the result both per category and per file.
results = {}
files = sorted(os.listdir(RAW))
for f in files:
    if not f.endswith(".md"):
        continue
    fpath = os.path.join(RAW, f)
    if not os.path.isfile(fpath):
        # A directory or broken link named "*.md" cannot be read here and
        # would make the later shutil.copy2 step fail, so leave it out.
        continue
    try:
        with open(fpath, 'r', encoding='utf-8', errors='replace') as fh:
            content = fh.read(2000)  # A leading excerpt is enough for classification
    except OSError as exc:
        # Best effort: keep going and classify on the filename alone, but say so.
        print(f"warning: could not read {f}: {exc}; classifying by filename only")
        content = ""

    cat = categorize(f, content)
    categories[cat].append(f)
    results[f] = cat

# Copy files: mirror each note into BASE/<category>/. The category folder is
# created even when no file was assigned to it.
for category in categories:
    target_dir = os.path.join(BASE, category)
    os.makedirs(target_dir, exist_ok=True)
    for name in categories[category]:
        shutil.copy2(os.path.join(RAW, name), os.path.join(target_dir, name))

# Print summary: per-category counts in alphabetical order, then the grand total.
print("=== CATEGORIZATION SUMMARY ===\n")
counts = {name: len(categories[name]) for name in sorted(categories)}
for name, count in counts.items():
    print(f"{name}: {count}")
total = sum(counts.values())
print(f"\nTOTAL: {total}")

# Print uncategorized files for review
leftovers = categories["uncategorized"]
if leftovers:
    print(f"\n=== UNCATEGORIZED FILES ({len(leftovers)}) ===")
    for leftover in leftovers:
        print(f"  {leftover}")

# Generate index.md: one section per non-empty category, each listing its
# notes as Markdown links relative to BASE.
index_path = os.path.join(BASE, "index.md")
# Explicit UTF-8: the heading contains an em dash and filenames may be
# non-ASCII, so the platform default encoding is not safe here.
with open(index_path, 'w', encoding='utf-8') as idx:
    idx.write("# Alex Wiki — Index\n\n")
    # Stamp the actual run date rather than a fixed literal.
    idx.write(f"Generated: {date.today().isoformat()}\n")
    idx.write(f"Total files: {total}\n\n")

    # Human-readable heading for each category path.
    cat_labels = {
        "motherlabs/compiler": "Motherlabs / Compiler",
        "motherlabs/architecture": "Motherlabs / Architecture",
        "motherlabs/strategy": "Motherlabs / Strategy",
        "ai-research/agents": "AI Research / Agents",
        "ai-research/models": "AI Research / Models",
        "ai-research/theory": "AI Research / Theory",
        "projects/tattooclaw": "Projects / TattooClaw",
        "projects/aigency": "Projects / AIgency",
        "projects/misc": "Projects / Misc",
        "personal/ideas": "Personal / Ideas",
        "personal/reflections": "Personal / Reflections",
        "personal/learning": "Personal / Learning",
        "uncategorized": "Uncategorized",
    }

    for cat in sorted(categories.keys()):
        if not categories[cat]:
            continue
        label = cat_labels.get(cat, cat)
        idx.write(f"## {label} ({len(categories[cat])})\n\n")
        for f in sorted(categories[cat]):
            # Drop only the trailing extension; a ".md" inside the name stays.
            stem = os.path.splitext(f)[0]
            title = stem.replace("--", " / ").replace("-", " ")
            # Percent-encode the target so names with spaces or other special
            # characters still form a valid Markdown link ("/" is kept as is).
            link = quote(f"{cat}/{f}")
            idx.write(f"- [{title}]({link})\n")
        idx.write("\n")

print(f"\nIndex written to: {index_path}")