added Modelfile

This commit is contained in:
root
2025-11-25 10:55:49 +00:00
parent 39ee7f6731
commit 444520651b
3 changed files with 210 additions and 28 deletions

125
main.py
View File

@@ -2,7 +2,6 @@ import os
import time
import imaplib
import email
import json
import requests
import logging
import sys
@@ -30,12 +29,16 @@ IMAP_PASS = os.environ.get("IMAP_PASS")
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://ollama-service.open-webui.svc:11434")
MODEL_NAME = os.environ.get("MODEL_NAME", "mail-router")
MAX_BODY_CHARS = int(os.environ.get("MAX_BODY_CHARS", "8000"))
MAX_BODY_CHARS = int(os.environ.get("MAX_BODY_CHARS", "2000"))
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", "300")) # v sekundách
OLLAMA_TIMEOUT = int(os.environ.get("OLLAMA_TIMEOUT", "120")) # read timeout v sekundách
OLLAMA_MAX_RETRIES = int(os.environ.get("OLLAMA_MAX_RETRIES", "3"))
# prahy jistoty
MIN_CONFIDENCE_DEFAULT = float(os.environ.get("MIN_CONFIDENCE_DEFAULT", "0.4"))
MIN_CONFIDENCE_RELAXED = float(os.environ.get("MIN_CONFIDENCE_RELAXED", "0.3"))
# povolené složky (whitelist) MUSÍ odpovídat tomu, co je v Modelfile
ALLOWED_FOLDERS = {
"INBOX",
@@ -50,9 +53,8 @@ ALLOWED_FOLDERS = {
"INBOX.ZTJ",
}
# tvrdá pravidla podle subjectu
# tvrdá pravidla podle subjectu Úkoly (faktury, vyúčtování)
HARDCODED_SUBJECT_RULES = [
# faktury / vyúčtování vždy do Úkoly
(re.compile(r"\bfaktura\b", re.IGNORECASE), "INBOX.Ukoly"),
(re.compile(r"\bvyúčtován[íi]\b|\bvyuctovan[íi]\b", re.IGNORECASE), "INBOX.Ukoly"),
(re.compile(r"\bdaňov[ýy]\s+doklad\b", re.IGNORECASE), "INBOX.Ukoly"),
@@ -65,6 +67,12 @@ HARDCODED_ZTJ_SUBJECT = re.compile(
re.IGNORECASE,
)
# Coarse marketing / newsletter pattern (case-insensitive, whole words).
# normalize_folder() searches the message Subject with it and, on a hit,
# forces the target folder to INBOX.Zpravodaje.
HARDCODED_MARKETING_SUBJECT = re.compile(
r"\b(black friday|sleva|slevy|akce|speciální nabídka|newsletter|zpravodaj|unsubscribe)\b",
re.IGNORECASE,
)
def log_config():
"""Vypíše aktuální konfiguraci (bez hesla)."""
@@ -79,6 +87,8 @@ def log_config():
logger.info(f"CHECK_INTERVAL = {CHECK_INTERVAL} s")
logger.info(f"OLLAMA_TIMEOUT = {OLLAMA_TIMEOUT} s")
logger.info(f"OLLAMA_MAX_RETRIES = {OLLAMA_MAX_RETRIES}")
logger.info(f"MIN_CONF_DEFAULT = {MIN_CONFIDENCE_DEFAULT}")
logger.info(f"MIN_CONF_RELAXED = {MIN_CONFIDENCE_RELAXED}")
logger.info(f"LOG_LEVEL = {LOG_LEVEL}")
logger.info(f"ALLOWED_FOLDERS = {sorted(ALLOWED_FOLDERS)}")
logger.info("====================================")
@@ -176,14 +186,14 @@ def warmup_model():
payload = {
"model": MODEL_NAME,
"stream": False,
"format": "json",
"messages": [
{
"role": "user",
"content": (
"HEADERS:\nFrom: warmup@example.com\nSubject: warmup\n\n"
"BODY:\nThis is a warmup request, respond with a valid JSON "
"using folder INBOX and confidence 0."
"BODY:\nThis is a warmup request. "
"Odpověz přesně ve formátu:\n"
"FOLDER: INBOX\nCONFIDENCE: 0.0\nREASON: warmup\nRULES:\n- warmup"
),
}
],
@@ -191,19 +201,73 @@ def warmup_model():
try:
r = requests.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=OLLAMA_TIMEOUT)
r.raise_for_status()
content = r.json().get("message", {}).get("content", "")
data = r.json()
content = data.get("message", {}).get("content", "")
logger.info(f"Warm-up response (first 200 chars): {content[:200].replace(chr(10), ' ')}")
except Exception as e:
logger.warning(f"Warm-up failed (will continue anyway): {e}")
# ---------- Parsing model output ----------
# Field patterns for the line-oriented model reply.
# ``[ \t]*`` is used instead of ``\s*`` on purpose: ``\s`` also matches
# newlines, so an empty "FOLDER:" / "REASON:" line would otherwise capture the
# *following* line as its value.
FOLDER_RE = re.compile(r"^FOLDER:[ \t]*(.*)$", re.MULTILINE)
# A comma is accepted as well so that a decimal comma ("0,8") is not read as 0.
CONF_RE = re.compile(r"^CONFIDENCE:[ \t]*([0-9.,]+)", re.MULTILINE)
REASON_RE = re.compile(r"^REASON:[ \t]*(.*)$", re.MULTILINE)
RULES_RE = re.compile(r"^RULES:[ \t]*(.*)$", re.MULTILINE)


def parse_model_output(content: str) -> dict:
    """Parse the plain-text classification reply produced by the model.

    Expected format::

        FOLDER: INBOX.Pracovni
        CONFIDENCE: 0.8
        REASON: ...
        RULES:
        - ...
        - ...

    Args:
        content: Raw text of the model reply.

    Returns:
        A dict with the keys ``folder`` (str), ``confidence`` (float),
        ``reason`` (str) and ``rules`` (list of str, one entry per ``- ``
        bullet found after the ``RULES:`` header).

    Raises:
        ValueError: If any of the FOLDER / CONFIDENCE / REASON / RULES
            headers is missing, or if the FOLDER value is empty.
    """
    folder_match = FOLDER_RE.search(content)
    conf_match = CONF_RE.search(content)
    reason_match = REASON_RE.search(content)
    rules_match = RULES_RE.search(content)

    if not (folder_match and conf_match and reason_match and rules_match):
        raise ValueError("Missing one of FOLDER/CONFIDENCE/REASON/RULES in model output")

    folder = folder_match.group(1).strip()
    if not folder:
        # A header without a value is as useless as a missing header.
        raise ValueError("Empty FOLDER value in model output")

    try:
        # Normalise a decimal comma before converting.
        confidence = float(conf_match.group(1).replace(",", "."))
    except ValueError:
        # Malformed number (e.g. "0.8.1"): treat it as no confidence at all.
        confidence = 0.0

    reason = reason_match.group(1).strip()

    # Rules: everything from the text right after "RULES:" to the end of the
    # reply, so a bullet written on the header line itself is kept too.
    rules_block = content[rules_match.start(1):]
    stripped_lines = (line.strip() for line in rules_block.splitlines())
    rules = [line[2:].strip() for line in stripped_lines if line.startswith("- ")]

    return {
        "folder": folder,
        "confidence": confidence,
        "reason": reason,
        "rules": rules,
    }
# ---------- LLM call ----------
def classify_email(prompt):
payload = {
"model": MODEL_NAME,
"stream": False,
"format": "json",
"messages": [
{"role": "user", "content": prompt}
],
@@ -228,12 +292,8 @@ def classify_email(prompt):
f"Model returned content (first 300 chars): "
f"{content[:300].replace(chr(10), ' ')}"
)
try:
result = json.loads(content)
except Exception as e:
logger.error(f"Error parsing JSON from model content: {e}")
logger.debug(f"Raw content was: {content}")
raise
result = parse_model_output(content)
logger.info(f"Parsed model result: {result}")
return result
@@ -245,7 +305,8 @@ def classify_email(prompt):
logger.info(f"Retrying in {backoff} seconds...")
time.sleep(backoff)
except Exception as e:
# ostatní chyby nemá smysl retryovat
# ostatní chyby nemá smysl retryovat (rozbitý output apod.)
last_exc = e
logger.error(f"Ollama request failed (non-retryable): {e}")
logger.debug(traceback.format_exc())
raise
@@ -257,7 +318,7 @@ def classify_email(prompt):
def normalize_folder(result, msg):
"""
Vrátí cílovou složku:
1) nejdřív tvrdá pravidla (ZTJ, faktury → Úkoly),
1) nejdřív tvrdá pravidla (ZTJ, faktury → Úkoly, marketing → Zpravodaje),
2) pak výsledek z modelu + threshold + whitelist.
"""
subject = msg.get("Subject", "") or ""
@@ -267,6 +328,11 @@ def normalize_folder(result, msg):
logger.info("Hardcoded ZTJ rule matched subject, forcing folder=INBOX.ZTJ")
return "INBOX.ZTJ"
# marketing / newsletter → Zpravodaje
if HARDCODED_MARKETING_SUBJECT.search(subject):
logger.info("Hardcoded marketing rule matched subject, forcing folder=INBOX.Zpravodaje")
return "INBOX.Zpravodaje"
# faktury / vyúčtování → Úkoly
for pattern, folder in HARDCODED_SUBJECT_RULES:
if pattern.search(subject):
@@ -278,21 +344,28 @@ def normalize_folder(result, msg):
# jinak necháme rozhodnout model
folder = result.get("folder", "INBOX")
try:
confidence = float(result.get("confidence", 0.0))
except Exception:
confidence = 0.0
confidence = float(result.get("confidence", 0.0) or 0.0)
logger.info(f"Model suggested folder={folder}, confidence={confidence}")
if confidence < 0.5:
logger.info(f"Low confidence ({confidence}), using INBOX as fallback")
return "INBOX"
# neznámá složka → INBOX
if folder not in ALLOWED_FOLDERS:
logger.warning(f"Folder {folder} not in ALLOWED_FOLDERS, using INBOX")
return "INBOX"
# pro spam/newsletter buďme benevolentnější
if folder in ("INBOX.Nepodstatne", "INBOX.Zpravodaje"):
threshold = MIN_CONFIDENCE_RELAXED
else:
threshold = MIN_CONFIDENCE_DEFAULT
if confidence < threshold:
logger.info(
f"Low confidence ({confidence}) for folder {folder} "
f"(threshold {threshold}), using INBOX as fallback"
)
return "INBOX"
return folder
@@ -376,6 +449,7 @@ def process_once():
except Exception as e:
logger.error(f"Error calling model for message {msg_id_str}: {e}")
logger.debug(traceback.format_exc())
# necháme zprávu v INBOXu, zpracuje se později
continue
target_folder = normalize_folder(result, msg)
@@ -397,7 +471,6 @@ def process_once():
def main():
logger.info("mail-classifier starting up...")
log_config()
# warm-up modelu, aby první reálný request netimeoutoval
warmup_model()
while True:
try: