import os
from sentence_transformers import SentenceTransformer
import chromadb
from PyPDF2 import PdfReader

# Paths: source documents are listed from DATA_DIR; the Chroma index is
# persisted under INDEX_DIR.
DATA_DIR = './raw/'
INDEX_DIR = './index/'
# Runs at import time so the persistent client below has a directory to use.
os.makedirs(INDEX_DIR, exist_ok=True)

# Embedding model and Chroma client (both created at import time).
model = SentenceTransformer('all-MiniLM-L6-v2')
client = chromadb.PersistentClient(path=INDEX_DIR)
# Guarantees that the "robot_docs" collection exists before index_files()
# tries to delete it.
collection = client.get_or_create_collection(name="robot_docs")

def parse_file(file_path):
    """Extract plain text from a PDF or UTF-8 text file.

    The file type is chosen from the extension, compared case-insensitively
    (``.pdf`` or ``.txt``). Files with any other extension are ignored.

    Args:
        file_path: Path of the file to read, as a string.

    Returns:
        The extracted text. For PDFs, the text of every page is followed by
        a newline. If reading or parsing fails, the error is printed (not
        raised) and whatever was collected so far is returned, which may be
        an empty string. Unsupported extensions yield an empty string.
    """
    lowered = file_path.lower()

    if lowered.endswith('.pdf'):
        page_texts = []
        try:
            reader = PdfReader(file_path)
            for page in reader.pages:
                # Defensive: treat a falsy extraction result as empty text so
                # a single page without text cannot abort the remaining pages.
                page_texts.append((page.extract_text() or "") + "\n")
        except Exception as e:
            # Best effort: a broken PDF must not stop the whole indexing run.
            # Pages collected before the failure are still returned.
            print(f"Fehler beim Parsen von {file_path}: {e}")
        return "".join(page_texts)

    if lowered.endswith('.txt'):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except (OSError, ValueError) as e:
            # ValueError covers UnicodeDecodeError for files that are not UTF-8.
            print(f"Fehler beim Lesen von {file_path}: {e}")

    return ""

def chunk_text(text, chunk_size=500):
    """Split text into consecutive chunks of whitespace-separated words.

    Args:
        text: The text to split. Words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of strings, each holding up to ``chunk_size`` words joined by
        single spaces. The last chunk may be shorter; text without any words
        gives an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

def index_files():
    """Rebuild the "robot_docs" collection from all files in DATA_DIR.

    The existing collection is deleted and recreated, so every run produces
    a fresh index. Each regular file in DATA_DIR is parsed, split into word
    chunks, embedded, and added with ``file`` / ``chunk_id`` metadata and
    ids of the form ``<filename>_<chunk index>``. Files that yield no text
    or no chunks are skipped.
    """
    # Rebind the module-level handle; otherwise it would keep pointing at
    # the collection that is deleted on the next line.
    global collection
    client.delete_collection(name="robot_docs")  # drop the old collection
    collection = client.create_collection(name="robot_docs")  # start fresh

    # Sorted so the processing order does not depend on the file system.
    for filename in sorted(os.listdir(DATA_DIR)):
        file_path = os.path.join(DATA_DIR, filename)
        if not os.path.isfile(file_path):
            continue

        print(f"Verarbeite {filename}...")
        text = parse_file(file_path)
        if not text:
            continue

        chunks = chunk_text(text)
        if not chunks:
            # Whitespace-only text (e.g. a PDF whose pages have no text
            # layer) is truthy but has no words; adding empty lists to the
            # collection would fail.
            continue

        # NOTE(review): all-MiniLM-L6-v2 reportedly truncates inputs beyond
        # 256 word pieces, so 500-word chunks may only be partially
        # embedded. Confirm against the model card before relying on recall.
        embeddings = model.encode(chunks)

        metadatas = []
        ids = []
        for i, _ in enumerate(chunks):
            metadatas.append({"file": filename, "chunk_id": i})
            ids.append(f"{filename}_{i}")

        collection.add(
            documents=chunks,
            embeddings=embeddings.tolist(),
            metadatas=metadatas,
            ids=ids,
        )
    print("Indexierung abgeschlossen.")

# Run a full re-index when the file is executed as a script.
if __name__ == "__main__":
    index_files()