init new dags

This commit is contained in:
Power BI Dev
2026-03-03 16:23:51 +07:00
parent cb3994eb34
commit 0bd5fdc297
4 changed files with 35 additions and 6 deletions

View File

@@ -1,13 +1,12 @@
FROM apache/airflow:2.7.1

USER root

# Create the Airflow working directories with least-privilege permissions:
# the airflow user owns everything, and the root group gets rwX so bind
# mounts / OpenShift-style arbitrary-UID runtimes still work. This replaces
# the previous world-writable chmod -R 777, which is a security anti-pattern.
RUN mkdir -p /opt/airflow/logs /opt/airflow/dags /opt/airflow/plugins /opt/airflow/scripts \
    && chown -R airflow:root /opt/airflow \
    && chmod -R g+rwX /opt/airflow/logs /opt/airflow/dags /opt/airflow/plugins /opt/airflow/scripts

USER airflow

# Install Python dependencies BEFORE copying application code so this layer
# stays cached until requirements.txt itself changes — editing a dag or
# script no longer re-runs pip install.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy project code last; it changes most often. --chown avoids a separate
# chown layer (which would double the layer size).
COPY --chown=airflow:root ./scripts /opt/airflow/scripts
COPY --chown=airflow:root ./dags /opt/airflow/dags

17
dags/main_pipeline.py Normal file
View File

@@ -0,0 +1,17 @@
import sys
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

# /opt/airflow/scripts lives OUTSIDE Airflow's default module search path —
# Airflow only auto-adds the dags/, plugins/ and config/ folders — so the
# `scripts` package must be made importable explicitly, otherwise the
# scheduler fails to parse this DAG file with ModuleNotFoundError.
# NOTE(review): assumes the image layout from the project Dockerfile
# (scripts copied to /opt/airflow/scripts) — confirm in deployment.
if "/opt/airflow" not in sys.path:
    sys.path.insert(0, "/opt/airflow")

from scripts.scraper_pddikti import run_scraping_logic

with DAG(
    dag_id="etl_akademik_separated_v1",
    start_date=datetime(2026, 3, 3),
    schedule_interval="@daily",
    catchup=False,
) as dag:
    # Single-task pipeline: delegates all work to scripts/scraper_pddikti.py
    # so the DAG file stays a thin orchestration layer.
    task_scraping = PythonOperator(
        task_id="run_pddikti_scraping",
        python_callable=run_scraping_logic,
    )

0
scripts/.gitkeep Normal file
View File

View File

@@ -0,0 +1,13 @@
import requests
import pandas as pd


def run_scraping_logic(
    output_path: str = "/opt/airflow/logs/data_akademik.csv",
    timeout: float = 30.0,
) -> str:
    """Fetch Indonesian university data from the Hipolabs API and save it as CSV.

    Args:
        output_path: Destination CSV file. Default keeps the original
            hard-coded location, so existing callers are unaffected.
        timeout: Seconds before the HTTP request is aborted.

    Returns:
        A human-readable summary of how many rows were written.

    Raises:
        requests.HTTPError: if the API responds with a non-2xx status.
        requests.RequestException: on network failure or timeout.
    """
    url = "http://universities.hipolabs.com/search?country=Indonesia"
    # A timeout keeps the Airflow worker from hanging forever on a dead host,
    # and raise_for_status() stops us from silently persisting an error body.
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    df = pd.DataFrame(res.json())
    # Cleaning logic / BigQuery upload goes here.
    df.to_csv(output_path, index=False)
    return f"Berhasil menarik {len(df)} data."