#!/usr/bin/env python3
"""
Sync dosen data from SPOTA to Directus.

Usage:
  python tools/sync_dosen_spota.py

Setup:
  - Set env vars (DIRECTUS_TOKEN required), or
  - Copy .env.example to .env and fill DIRECTUS_TOKEN
"""

from __future__ import annotations

import html
import json
import os
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
from typing import Dict, List, Tuple

# Patterns used to scrape the SPOTA lecturer table. DOTALL lets a row or a
# cell span several source lines; IGNORECASE tolerates upper-case tags.
_ROW_PATTERN = re.compile(r"<tr\b[^>]*>(.*?)</tr>", re.IGNORECASE | re.DOTALL)
_CELL_PATTERN = re.compile(r"<td\b[^>]*>(.*?)</td>", re.IGNORECASE | re.DOTALL)
_IMG_PATTERN = re.compile(r"<img\b[^>]+src=[\"']([^\"']+)[\"']", re.IGNORECASE)


def load_env_file(path: str) -> None:
    """Load ``KEY=VALUE`` pairs from *path* into ``os.environ``.

    Does nothing when *path* is not a file. Blank lines, lines starting with
    ``#`` and lines without ``=`` are skipped. A value wrapped in matching
    single or double quotes has the quotes removed. Variables that are
    already present in the environment are never overwritten.
    """
    if not os.path.isfile(path):
        return

    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue

            key, value = line.split("=", 1)
            key = key.strip()
            value = value.strip()

            if (value.startswith('"') and value.endswith('"')) or (
                value.startswith("'") and value.endswith("'")
            ):
                value = value[1:-1]

            # The real environment wins over the .env file.
            if key and key not in os.environ:
                os.environ[key] = value


def http_request(
    method: str,
    url: str,
    headers: Dict[str, str] | None = None,
    json_body: Dict | List | None = None,
    timeout: int = 30,
) -> Tuple[int, str]:
    """Perform an HTTP request and return ``(status_code, body_text)``.

    When *json_body* is given it is serialized as UTF-8 JSON and the
    ``Content-Type`` header is set to ``application/json``. HTTP error
    responses (``urllib.error.HTTPError``) are returned as a normal
    ``(status, body)`` pair instead of being raised; any other error raised
    by ``urlopen`` propagates to the caller.
    """
    payload = None
    req_headers = dict(headers or {})  # copy so the caller's dict is untouched

    if json_body is not None:
        payload = json.dumps(json_body, ensure_ascii=False).encode("utf-8")
        req_headers["Content-Type"] = "application/json"

    request = urllib.request.Request(
        url=url,
        data=payload,
        headers=req_headers,
        method=method.upper(),
    )

    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read().decode("utf-8", errors="replace")
            return int(getattr(response, "status", 200)), body
    except urllib.error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace") if e.fp else ""
        return int(e.code), body


def strip_tags(text: str) -> str:
    """Remove ``<...>`` tags from *text*, unescape HTML entities and trim."""
    clean = re.sub(r"<[^>]*>", "", text, flags=re.IGNORECASE | re.DOTALL)
    return html.unescape(clean).strip()


def normalize_kk(kk: str, mapping: Dict[str, str]) -> str:
    """Map a raw expertise-group label to its canonical name.

    Returns ``""`` for an empty/blank label and the trimmed label itself when
    *mapping* has no entry for it.
    """
    value = (kk or "").strip()
    if not value:
        return ""
    return mapping.get(value, value)


def normalize_photo_url(url: str, base_url: str) -> str:
    """Turn a photo ``src`` value into an absolute URL.

    Returns ``""`` for an empty value or for a URL containing
    ``noimageprofile`` (case-insensitive). Protocol-relative URLs get an
    ``https:`` scheme, absolute http(s) URLs are returned unchanged, and
    anything else is resolved against ``base_url + "/"``.
    """
    value = (url or "").strip()
    if not value:
        return ""
    if "noimageprofile" in value.lower():
        return ""
    if value.startswith("//"):
        return "https:" + value
    if re.match(r"^https?://", value, flags=re.IGNORECASE):
        return value
    return urllib.parse.urljoin(base_url + "/", value)


def parse_spota_table(
    html_doc: str, kk_mapping: Dict[str, str], spota_base: str
) -> List[Dict[str, str]]:
    """Extract lecturer records from the SPOTA HTML table.

    Each ``<tr>`` with at least six ``<td>`` cells is read as: cell 1 = photo,
    cell 2 = name, cell 3 = NIP, cell 4 = email, cell 5 = expertise group
    (cell 0 is ignored). Rows whose name is empty or equals ``"nama"``
    (a header row) are skipped.

    Returns a list of dicts with keys ``nama``, ``nip``, ``email``, ``kk``
    and ``foto``.
    """
    dosen: List[Dict[str, str]] = []

    for row_html in _ROW_PATTERN.findall(html_doc):
        cells = _CELL_PATTERN.findall(row_html)
        if len(cells) < 6:
            continue

        nama = strip_tags(cells[2])
        nip = strip_tags(cells[3])
        email_addr = strip_tags(cells[4])
        kk = normalize_kk(strip_tags(cells[5]), kk_mapping)

        if not nama or nama.lower() == "nama":
            continue

        img_match = _IMG_PATTERN.search(cells[1])
        foto = normalize_photo_url(img_match.group(1) if img_match else "", spota_base)

        dosen.append(
            {
                "nama": nama,
                "nip": nip,
                "email": email_addr,
                "kk": kk,
                "foto": foto,
            }
        )

    return dosen


def parse_json(body: str, context: str) -> Dict:
    """Decode *body* as a JSON object.

    Raises:
        RuntimeError: if *body* is not valid JSON or the top-level value is
            not an object. *context* is included in the message.
    """
    try:
        data = json.loads(body)
    except json.JSONDecodeError as e:
        raise RuntimeError(f"{context} returned invalid JSON: {e}") from e

    if not isinstance(data, dict):
        raise RuntimeError(f"{context} returned non-object JSON")
    return data


def summarize(text: str, limit: int = 300) -> str:
    """Return *text* trimmed and cut to *limit* characters for log output."""
    value = (text or "").strip()
    if not value:
        return "(empty response body)"
    return value[:limit] + ("…" if len(value) > limit else "")


def main() -> int:
    """Run the SPOTA -> Directus sync and return a process exit code.

    Reads ``SPOTA_URL``, ``DIRECTUS_URL`` and ``DIRECTUS_TOKEN`` from the
    environment (after loading ``.env`` from the current directory).
    Returns 0 when every create/update succeeded and 1 on any failure.
    """
    load_env_file(os.path.join(os.getcwd(), ".env"))

    spota_url = os.getenv("SPOTA_URL", "https://spota.untan.ac.id/listdosen.php")
    directus_url = os.getenv("DIRECTUS_URL", "https://api.ifuntanhub.dev").rstrip("/")
    directus_token = os.getenv("DIRECTUS_TOKEN", "")

    if not directus_token:
        print("ERROR: DIRECTUS_TOKEN is required", file=sys.stderr)
        print("Create .env from .env.example or set runtime env variables", file=sys.stderr)
        return 1

    # Canonical names map to themselves; the last two entries are aliases.
    kk_mapping = {
        "Computation & Artificial Intelligence": "Computation & Artificial Intelligence",
        "Networking & Security": "Networking & Security",
        "Software Engineering & Mobile Computing": "Software Engineering & Mobile Computing",
        "Information System & Data Spatial": "Information System & Data Spatial",
        "Computing & AI": "Computation & Artificial Intelligence",
        "Networks & Security": "Networking & Security",
    }

    print("Fetching data from SPOTA...")
    spota_status, spota_body = http_request(
        "GET",
        spota_url,
        headers={
            "User-Agent": "Mozilla/5.0 (compatible; Informatika-UNTAN-Sync/1.0)",
            "Accept": "text/html,application/xhtml+xml",
        },
    )
    if not (200 <= spota_status < 300):
        print(f"SPOTA request failed ({spota_status}): {summarize(spota_body)}", file=sys.stderr)
        return 1

    spota_data = parse_spota_table(spota_body, kk_mapping, "https://spota.untan.ac.id")
    print(f"Parsed {len(spota_data)} dosen from SPOTA")
    if not spota_data:
        print("No dosen data parsed from SPOTA. Aborting.", file=sys.stderr)
        return 1

    directus_headers = {
        "Authorization": f"Bearer {directus_token}",
        "Accept": "application/json",
    }

    print("Fetching existing dosen from Directus...")
    existing_status, existing_body = http_request(
        "GET",
        f"{directus_url}/items/dosen?limit=-1&fields=id,nama,nip",
        headers=directus_headers,
    )
    if not (200 <= existing_status < 300):
        print(
            f"Directus GET failed ({existing_status}): {summarize(existing_body)}",
            file=sys.stderr,
        )
        return 1

    existing_json = parse_json(existing_body, "Directus GET /items/dosen")
    existing_rows = existing_json.get("data", [])
    if not isinstance(existing_rows, list):
        existing_rows = []

    # Index existing records by NIP and by lower-cased name for matching.
    existing_by_nip: Dict[str, str] = {}
    existing_by_name: Dict[str, str] = {}
    for row in existing_rows:
        if not isinstance(row, dict):
            continue
        row_id = row.get("id")
        if row_id is None:
            continue
        nip = str(row.get("nip") or "").strip()
        nama = str(row.get("nama") or "").strip().lower()
        if nip:
            existing_by_nip[nip] = str(row_id)
        if nama:
            existing_by_name[nama] = str(row_id)

    to_create = []
    to_update = []

    for d in spota_data:
        payload = {
            "nama": d.get("nama", ""),
            "nip": d.get("nip", ""),
            "email": d.get("email", ""),
            "kelompok_keahlian": d.get("kk", ""),
            # If you later add a dedicated URL field in Directus, map photo URL here.
            # "foto_url": d.get("foto", ""),
        }

        nip = str(d.get("nip") or "").strip()
        name_key = str(d.get("nama") or "").strip().lower()

        # Prefer a NIP match; fall back to a case-insensitive name match.
        if nip and nip in existing_by_nip:
            to_update.append({"id": existing_by_nip[nip], "nama": payload["nama"], "data": payload})
            continue

        if name_key and name_key in existing_by_name:
            to_update.append(
                {"id": existing_by_name[name_key], "nama": payload["nama"], "data": payload}
            )
            continue

        to_create.append(payload)

    print(f"To create: {len(to_create)}, to update: {len(to_update)}")

    created_ok = 0
    created_fail = 0
    for item in to_create:
        status, body = http_request(
            "POST",
            f"{directus_url}/items/dosen",
            headers=directus_headers,
            json_body=item,
        )
        if 200 <= status < 300:
            created_ok += 1
        else:
            created_fail += 1
            print(
                f"Create failed ({item.get('nama', 'unknown')}) [{status}]: {summarize(body)}",
                file=sys.stderr,
            )

    updated_ok = 0
    updated_fail = 0
    for item in to_update:
        status, body = http_request(
            "PATCH",
            f"{directus_url}/items/dosen/{urllib.parse.quote(item['id'], safe='')}",
            headers=directus_headers,
            json_body=item["data"],
        )
        if 200 <= status < 300:
            updated_ok += 1
        else:
            updated_fail += 1
            print(
                f"Update failed ({item.get('nama', 'unknown')}) [{status}]: {summarize(body)}",
                file=sys.stderr,
            )

    print(f"Created: {created_ok}, failed: {created_fail}")
    print(f"Updated: {updated_ok}, failed: {updated_fail}")
    print("Sync complete.")

    return 1 if (created_fail > 0 or updated_fail > 0) else 0


def run_sync_logic() -> bool:
    """Run :func:`main` and return ``True`` on success.

    Raises:
        RuntimeError: if :func:`main` returns a non-zero exit code.
    """
    code = main()
    if code != 0:
        raise RuntimeError("sync_dosen_spota failed")
    return True


if __name__ == "__main__":
    raise SystemExit(main())