diff --git a/Dockerfile b/Dockerfile
index e24a287..fa35dc9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,13 +1,16 @@
 FROM apache/airflow:2.7.1
 
 USER root
-
-# Membuat folder dan membuka akses selebar-lebarnya (777)
-RUN mkdir -p /opt/airflow/logs /opt/airflow/dags /opt/airflow/plugins \
-    && chown -R airflow:root /opt/airflow \
-    && chmod -R 777 /opt/airflow/logs /opt/airflow/dags /opt/airflow/plugins
+# Create the scripts folder that holds the shared task code
+RUN mkdir -p /opt/airflow/scripts
+# Copy the scripts and dags folders from the Git checkout into the container
+COPY --chown=airflow:root ./scripts /opt/airflow/scripts
+COPY --chown=airflow:root ./dags /opt/airflow/dags
+
+# Airflow only adds the dags, plugins and config folders to sys.path, so expose
+# /opt/airflow to make `from scripts... import ...` resolvable inside DAG files.
+ENV PYTHONPATH="/opt/airflow${PYTHONPATH:+:${PYTHONPATH}}"
 
 USER airflow
-
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
\ No newline at end of file
diff --git a/dags/main_pipeline.py b/dags/main_pipeline.py
new file mode 100644
index 0000000..2300921
--- /dev/null
+++ b/dags/main_pipeline.py
@@ -0,0 +1,20 @@
+"""Daily ETL DAG that runs the scraping task defined in the scripts folder."""
+
+from datetime import datetime
+
+from airflow import DAG
+from airflow.operators.python import PythonOperator
+
+# The task logic lives in the separate scripts folder
+from scripts.scraper_pddikti import run_scraping_logic
+
+with DAG(
+    dag_id="etl_akademik_separated_v1",
+    start_date=datetime(2026, 3, 3),
+    schedule_interval="@daily",
+    catchup=False,
+) as dag:
+    task_scraping = PythonOperator(
+        task_id="run_pddikti_scraping",
+        python_callable=run_scraping_logic,  # Calls the external function
+    )
diff --git a/scripts/.gitkeep b/scripts/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/scraper_pddikti.py b/scripts/scraper_pddikti.py
new file mode 100644
index 0000000..14bbdb7
--- /dev/null
+++ b/scripts/scraper_pddikti.py
@@ -0,0 +1,28 @@
+"""Scraping logic called by the Airflow pipeline."""
+
+import pandas as pd
+import requests
+
+
+def run_scraping_logic():
+    """Fetch the Indonesian university list and store it as a CSV file.
+
+    Returns:
+        A status message containing the number of rows fetched.
+
+    Raises:
+        requests.RequestException: If the request fails, times out, or the
+            server answers with an HTTP error status.
+    """
+    # Example scraping source
+    url = "http://universities.hipolabs.com/search?country=Indonesia"
+    # Bound the wait so a stalled server cannot hang the task indefinitely,
+    # and fail loudly on HTTP errors instead of parsing an error body.
+    res = requests.get(url, timeout=30)
+    res.raise_for_status()
+    df = pd.DataFrame(res.json())
+
+    # Cleaning logic or the upload to BigQuery goes here
+    output_path = "/opt/airflow/logs/data_akademik.csv"
+    df.to_csv(output_path, index=False)
+    return f"Berhasil menarik {len(df)} data."