new etl listdosen from spota

2026-03-30 22:01:42 +07:00
parent 82ce018913
commit d36c9b663f
2 changed files with 322 additions and 31 deletions
@@ -5,9 +5,9 @@ from datetime import datetime
 from scripts.scraper_pddikti import run_scraping_logic

 with DAG(
-    dag_id="etl_akademik_separated_v1",
+    dag_id="sync_dosen_spota_to_directus",
    start_date=datetime(2026, 3, 3),
-    schedule_interval="@daily",
+    schedule_interval="@monthly",
    catchup=False
 ) as dag:

@@ -1,31 +1,322 @@
-import requests
-import pandas as pd
+#!/usr/bin/env python3
+"""
+Sync dosen data from SPOTA to Directus.
+
+Usage:
+    python tools/sync_dosen_spota.py
+
+Setup:
+    - Set env vars (DIRECTUS_TOKEN required), or
+    - Copy .env.example to .env and fill DIRECTUS_TOKEN
+"""
+
+from __future__ import annotations
+
+import html
+import json
+import os
+import re
+import sys
+import urllib.error
+import urllib.parse
+import urllib.request
+from typing import Dict, List, Tuple
+
+
+def load_env_file(path: str) -> None:
+    if not os.path.isfile(path):
+        return
+
+    with open(path, "r", encoding="utf-8") as f:
+        for raw_line in f:
+            line = raw_line.strip()
+            if not line or line.startswith("#") or "=" not in line:
+                continue
+
+            key, value = line.split("=", 1)
+            key = key.strip()
+            value = value.strip()
+
+            if (value.startswith('"') and value.endswith('"')) or (
+                value.startswith("'") and value.endswith("'")
+            ):
+                value = value[1:-1]
+
+            if key and key not in os.environ:
+                os.environ[key] = value
+
+
+def http_request(
+    method: str,
+    url: str,
+    headers: Dict[str, str] | None = None,
+    json_body: Dict | List | None = None,
+    timeout: int = 30,
+) -> Tuple[int, str]:
+    payload = None
+    req_headers = dict(headers or {})
+
+    if json_body is not None:
+        payload = json.dumps(json_body, ensure_ascii=False).encode("utf-8")
+        req_headers["Content-Type"] = "application/json"
+
+    request = urllib.request.Request(
+        url=url,
+        data=payload,
+        headers=req_headers,
+        method=method.upper(),
+    )

-def run_scraping_logic():
-    print("--- MEMULAI PROSES PENARIKAN DATA DUMMY ---")
-    
-    # URL API Publik (Data Universitas)
-    url = "http://universities.hipolabs.com/search?country=Indonesia"
-    
    try:
-        # 1. Extract
-        response = requests.get(url, timeout=10)
-        data = response.json()
-        
-        # 2. Transform (Gunakan Pandas)
-        df = pd.DataFrame(data)
-        
-        # Kita ambil 5 data teratas saja untuk ditampilkan di log
-        preview_data = df[['name', 'web_pages']].head(5)
-        
-        # 3. Load (Simulasi: Tampilkan ke Log Airflow)
-        print("HASIL PENARIKAN DATA (5 Teratas):")
-        print("====================================================")
-        print(preview_data.to_string(index=False))
-        print("====================================================")
-        print(f"Total data yang berhasil ditarik: {len(df)} baris.")
-        
-        return True
-    except Exception as e:
-        print(f"TERJADI KESALAHAN: {str(e)}")
-        return False
+        with urllib.request.urlopen(request, timeout=timeout) as response:
+            body = response.read().decode("utf-8", errors="replace")
+            return int(getattr(response, "status", 200)), body
+    except urllib.error.HTTPError as e:
+        body = e.read().decode("utf-8", errors="replace") if e.fp else ""
+        return int(e.code), body
+
+
+def strip_tags(text: str) -> str:
+    clean = re.sub(r"<[^>]*>", "", text, flags=re.IGNORECASE | re.DOTALL)
+    return html.unescape(clean).strip()
+
+
+def normalize_kk(kk: str, mapping: Dict[str, str]) -> str:
+    value = (kk or "").strip()
+    if not value:
+        return ""
+    return mapping.get(value, value)
+
+
+def normalize_photo_url(url: str, base_url: str) -> str:
+    value = (url or "").strip()
+    if not value:
+        return ""
+    if "noimageprofile" in value.lower():
+        return ""
+    if value.startswith("//"):
+        return "https:" + value
+    if re.match(r"^https?://", value, flags=re.IGNORECASE):
+        return value
+    return urllib.parse.urljoin(base_url + "/", value)
+
+
+def parse_spota_table(html_doc: str, kk_mapping: Dict[str, str], spota_base: str) -> List[Dict[str, str]]:
+    row_pattern = re.compile(r"<tr[^>]*>(.*?)</tr>", re.IGNORECASE | re.DOTALL)
+    cell_pattern = re.compile(r"<td[^>]*>(.*?)</td>", re.IGNORECASE | re.DOTALL)
+    img_pattern = re.compile(r"<img[^>]+src=[\"']([^\"']+)[\"']", re.IGNORECASE)
+
+    dosen: List[Dict[str, str]] = []
+
+    for row_html in row_pattern.findall(html_doc):
+        cells = cell_pattern.findall(row_html)
+        if len(cells) < 6:
+            continue
+
+        nama = strip_tags(cells[2])
+        nip = strip_tags(cells[3])
+        email_addr = strip_tags(cells[4])
+        kk = normalize_kk(strip_tags(cells[5]), kk_mapping)
+
+        if not nama or nama.lower() == "nama":
+            continue
+
+        img_match = img_pattern.search(cells[1])
+        foto = normalize_photo_url(img_match.group(1) if img_match else "", spota_base)
+
+        dosen.append(
+            {
+                "nama": nama,
+                "nip": nip,
+                "email": email_addr,
+                "kk": kk,
+                "foto": foto,
+            }
+        )
+
+    return dosen
+
+
+def parse_json(body: str, context: str) -> Dict:
+    try:
+        data = json.loads(body)
+    except json.JSONDecodeError as e:
+        raise RuntimeError(f"{context} returned invalid JSON: {e}") from e
+
+    if not isinstance(data, dict):
+        raise RuntimeError(f"{context} returned non-object JSON")
+
+    return data
+
+
+def summarize(text: str, limit: int = 300) -> str:
+    value = (text or "").strip()
+    if not value:
+        return "(empty response body)"
+    return value[:limit] + ("…" if len(value) > limit else "")
+
+
+def main() -> int:
+    load_env_file(os.path.join(os.getcwd(), ".env"))
+
+    spota_url = os.getenv("SPOTA_URL", "https://spota.untan.ac.id/listdosen.php")
+    directus_url = os.getenv("DIRECTUS_URL", "https://api.ifuntanhub.dev").rstrip("/")
+    directus_token = os.getenv("DIRECTUS_TOKEN", "")
+
+    if not directus_token:
+        print("ERROR: DIRECTUS_TOKEN is required", file=sys.stderr)
+        print("Create .env from .env.example or set runtime env variables", file=sys.stderr)
+        return 1
+
+    kk_mapping = {
+        "Computation & Artificial Intelligence": "Computation & Artificial Intelligence",
+        "Networking & Security": "Networking & Security",
+        "Software Engineering & Mobile Computing": "Software Engineering & Mobile Computing",
+        "Information System & Data Spatial": "Information System & Data Spatial",
+        "Computing & AI": "Computation & Artificial Intelligence",
+        "Networks & Security": "Networking & Security",
+    }
+
+    print("Fetching data from SPOTA...")
+    spota_status, spota_body = http_request(
+        "GET",
+        spota_url,
+        headers={
+            "User-Agent": "Mozilla/5.0 (compatible; Informatika-UNTAN-Sync/1.0)",
+            "Accept": "text/html,application/xhtml+xml",
+        },
+    )
+
+    if not (200 <= spota_status < 300):
+        print(f"SPOTA request failed ({spota_status}): {summarize(spota_body)}", file=sys.stderr)
+        return 1
+
+    spota_data = parse_spota_table(spota_body, kk_mapping, "https://spota.untan.ac.id")
+    print(f"Parsed {len(spota_data)} dosen from SPOTA")
+
+    if not spota_data:
+        print("No dosen data parsed from SPOTA. Aborting.", file=sys.stderr)
+        return 1
+
+    directus_headers = {
+        "Authorization": f"Bearer {directus_token}",
+        "Accept": "application/json",
+    }
+
+    print("Fetching existing dosen from Directus...")
+    existing_status, existing_body = http_request(
+        "GET",
+        f"{directus_url}/items/dosen?limit=-1&fields=id,nama,nip",
+        headers=directus_headers,
+    )
+
+    if not (200 <= existing_status < 300):
+        print(
+            f"Directus GET failed ({existing_status}): {summarize(existing_body)}",
+            file=sys.stderr,
+        )
+        return 1
+
+    existing_json = parse_json(existing_body, "Directus GET /items/dosen")
+    existing_rows = existing_json.get("data", [])
+    if not isinstance(existing_rows, list):
+        existing_rows = []
+
+    existing_by_nip: Dict[str, str] = {}
+    existing_by_name: Dict[str, str] = {}
+
+    for row in existing_rows:
+        if not isinstance(row, dict):
+            continue
+        row_id = row.get("id")
+        if row_id is None:
+            continue
+
+        nip = str(row.get("nip") or "").strip()
+        nama = str(row.get("nama") or "").strip().lower()
+
+        if nip:
+            existing_by_nip[nip] = str(row_id)
+        if nama:
+            existing_by_name[nama] = str(row_id)
+
+    to_create = []
+    to_update = []
+
+    for d in spota_data:
+        payload = {
+            "nama": d.get("nama", ""),
+            "nip": d.get("nip", ""),
+            "email": d.get("email", ""),
+            "kelompok_keahlian": d.get("kk", ""),
+            # If you later add a dedicated URL field in Directus, map photo URL here.
+            # "foto_url": d.get("foto", ""),
+        }
+
+        nip = str(d.get("nip") or "").strip()
+        name_key = str(d.get("nama") or "").strip().lower()
+
+        if nip and nip in existing_by_nip:
+            to_update.append({"id": existing_by_nip[nip], "nama": payload["nama"], "data": payload})
+            continue
+
+        if name_key and name_key in existing_by_name:
+            to_update.append({"id": existing_by_name[name_key], "nama": payload["nama"], "data": payload})
+            continue
+
+        to_create.append(payload)
+
+    print(f"To create: {len(to_create)}, to update: {len(to_update)}")
+
+    created_ok = 0
+    created_fail = 0
+    for item in to_create:
+        status, body = http_request(
+            "POST",
+            f"{directus_url}/items/dosen",
+            headers=directus_headers,
+            json_body=item,
+        )
+        if 200 <= status < 300:
+            created_ok += 1
+        else:
+            created_fail += 1
+            print(
+                f"Create failed ({item.get('nama', 'unknown')}) [{status}]: {summarize(body)}",
+                file=sys.stderr,
+            )
+
+    updated_ok = 0
+    updated_fail = 0
+    for item in to_update:
+        status, body = http_request(
+            "PATCH",
+            f"{directus_url}/items/dosen/{urllib.parse.quote(item['id'], safe='')}",
+            headers=directus_headers,
+            json_body=item["data"],
+        )
+        if 200 <= status < 300:
+            updated_ok += 1
+        else:
+            updated_fail += 1
+            print(
+                f"Update failed ({item.get('nama', 'unknown')}) [{status}]: {summarize(body)}",
+                file=sys.stderr,
+            )
+
+    print(f"Created: {created_ok}, failed: {created_fail}")
+    print(f"Updated: {updated_ok}, failed: {updated_fail}")
+    print("Sync complete.")
+
+    return 1 if (created_fail > 0 or updated_fail > 0) else 0
+
+
+def run_sync_logic() -> bool:
+    code = main()
+    if code != 0:
+        raise RuntimeError("sync_dosen_spota failed")
+    return True
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())