
Commit dcd45a7

Create regulamentos.yml
1 parent 47f6721 commit dcd45a7

1 file changed

Lines changed: 149 additions & 0 deletions

File tree

.github/workflows/regulamentos.yml

@@ -0,0 +1,149 @@
name: Process ANAC regulations

on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 1 * *' # Runs at 00:00 UTC on the 1st of every month

jobs:
  processar-dados:
    runs-on: ubuntu-latest

    steps:
      - name: Check out the code
        uses: actions/checkout@v3
        with:
          fetch-depth: 0 # fetch the full history

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pandas requests openpyxl

      - name: Run Python script
        run: |
          cat << 'EOF' > processar_dados.py
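          # processar_dados.py: downloads the ANAC regulation JSON files listed in
          # regulamentos-url-json.csv, cleans and deduplicates the records, derives
          # tags from the buscar_substituir_valores.xlsx spreadsheet, sorts by date,
          # and writes CSV/TSV/JSON outputs to the repository root.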
          import concurrent.futures
          import io
          import re

          import pandas as pd
          import requests

          # Source list of regulation JSON URLs (one per line) and the
          # search/replace spreadsheet that drives the tagging step
          url = 'https://raw.githubusercontent.com/gabrielmacedoanac/flat-data-anac/main/regulamentos-url-json.csv'
          buscar_substituir = 'https://raw.githubusercontent.com/gabrielmacedoanac/flat-data-anac/main/buscar_substituir_valores.xlsx'

          def baixar_arquivos(url):
              # The original routed requests through public DNS resolvers (8.8.8.8,
              # 8.8.4.4) as if they were HTTP proxies, via proxies={"https://": ...}.
              # requests expects scheme keys like "https", and a DNS resolver is not
              # an HTTP proxy, so every request would fail. Fetch directly instead.
              response = requests.get(url.strip(), timeout=30)
              response.raise_for_status()
              content = response.content.decode('utf-8')
              return pd.read_json(io.StringIO(content))

          def criar_dataframe(url=url):
              urls = requests.get(url, timeout=30).text.split()
              dfs = []
              with concurrent.futures.ThreadPoolExecutor() as executor:
                  # Map each future back to its URL: as_completed() yields futures
                  # in completion order, so pairing results with enumerate() indexes
                  # would report the wrong URL on failure.
                  futures = {executor.submit(baixar_arquivos, item): item for item in urls}
                  for future in concurrent.futures.as_completed(futures):
                      item = futures[future]
                      try:
                          df = future.result()
                          if df is not None and not df.empty:
                              dfs.append(df)
                          else:
                              print(f"Empty DataFrame for URL {item}")
                      except Exception as e:
                          print(f"Error processing URL {item}: {e}")
              if dfs:
                  return pd.concat(dfs, ignore_index=True)
              return None
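
          # Cleaning pass: drop empty and duplicate rows, collapse embedded line
          # breaks/tabs in text columns, and rewrite the 'anexos' attachment links
          # (portalhomolog2 -> www, @@download -> @@display-file, spaces -> %20).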
          def limpar_dataframe(df):
              df.dropna(how='all', inplace=True)
              df.drop_duplicates(keep='last', inplace=True)
              for col in df.select_dtypes(include=[object]).columns:
                  df[col] = df[col].map(lambda x: x.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ').strip() if isinstance(x, str) else x)
              df['anexos'] = (df['anexos']
                              .str.replace('portalhomolog2', 'www')
                              .str.replace('@@download/', '/@@display-file/')
                              .str.replace(' , ', '|')
                              .str.replace(', ', '|')
                              .str.replace('||', '|')
                              .str.replace(' ', '%20')
                              .str.replace('|', ' '))
              return df
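
          # Tagging: match spreadsheet terms against the concatenated ementa/norma
          # text, normalize the matches via the 'substituir' column, and keep a
          # sorted, deduplicated tag list per row.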
          def criar_tags(df):
              # Concatenate the free-text fields and lowercase them for matching
              df['dados_nao_estruturados'] = df[['ementa', 'norma']].agg(' | '.join, axis=1)
              df['dados_nao_estruturados'] = df['dados_nao_estruturados'].str.casefold()
              buscar_substituir_valores = pd.read_excel(buscar_substituir)
              # Single case-insensitive alternation built from the 'buscar' column
              buscar_regex = '(?i)' + '|'.join(buscar_substituir_valores['buscar'])
              buscar_compiled = re.compile(buscar_regex)
              df['tags'] = df['dados_nao_estruturados'].str.findall(buscar_compiled)

              def substituir_valores(valor, buscar_substituir_valores):
                  for _, row in buscar_substituir_valores.iterrows():
                      buscar = row['buscar']
                      substituir = row['substituir']
                      if not pd.isnull(substituir):
                          valor = re.sub(buscar, str(substituir), valor)
                  return valor

              df['tags'] = df['tags'].astype(str)
              df['tags'] = df['tags'].apply(substituir_valores, args=(buscar_substituir_valores,))
              df['tags'] = df['tags'].str.findall(buscar_compiled)
              # Deduplicate and sort the matches per row (replaces the original
              # join("|") / replace / split(', ') round-trip, which broke on tags
              # containing the separators themselves)
              df['tags'] = df['tags'].apply(lambda tags: sorted(set(tags)))
              df.drop(columns=['dados_nao_estruturados'], inplace=True)
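
          # Dates arrive as dd/mm/yyyy; parse them, sort newest first, then emit
          # ISO yyyy-mm-dd so the saved files also sort chronologically as text.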
          def ordenar_dados(df):
              df['data'] = pd.to_datetime(df['data'], format='%d/%m/%Y')
              df = df.sort_values(['data', 'tipo_normatico', 'ementa'], ascending=[False, True, True])
              df['data'] = df['data'].dt.strftime('%Y-%m-%d')
              return df

          def salvar_arquivos_github(df):
              csv_path = 'regulamentos-anac-tags.csv'
              tsv_path = 'regulamentos-anac-tags.tsv'
              json_path = 'regulamentos-anac-tags.json'

              # utf-8-sig prepends a BOM so Excel opens the CSV/TSV with accents intact
              df.to_csv(csv_path, index=False, encoding='utf-8-sig')
              df.to_csv(tsv_path, sep='\t', index=False, encoding='utf-8-sig')
              df.to_json(json_path, orient='records', force_ascii=False)

              return csv_path, tsv_path, json_path
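
          # Run the pipeline: download -> clean -> tag -> sort -> save. The output
          # files are committed back to the repository by the steps below.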
          df = criar_dataframe()
          if df is not None:
              df = limpar_dataframe(df)
              criar_tags(df)  # adds the 'tags' column in place
              df = ordenar_dados(df)
              paths = salvar_arquivos_github(df)
              print("Files generated successfully:", paths)
          else:
              print("Failed to build the DataFrame - no data was processed")
          EOF

          python processar_dados.py
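
      # git diff --quiet exits non-zero when any of the tracked output files
      # changed; that branch writes changes_detected=true to $GITHUB_OUTPUT,
      # which gates the commit step below.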
      - name: Check for changes in the output files
        id: changes
        run: |
          echo "Checking the processed files for differences..."
          git diff --quiet regulamentos-anac-tags.csv regulamentos-anac-tags.tsv regulamentos-anac-tags.json || echo "changes_detected=true" >> $GITHUB_OUTPUT

      - name: Commit and push the changes
        if: steps.changes.outputs.changes_detected == 'true'
        run: |
          git config --global user.name "GitHub Actions"
          git config --global user.email "[email protected]"
          git add regulamentos-anac-tags.*
          git commit -m "$(date +'%Y-%m-%d %H:%M UTC') Automatic update"
          git push
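
Once the workflow has run, the three output files sit at the repository root. A minimal sketch of how a consumer might load the JSON artifact, assuming it is published on the main branch of this repository:

    import pandas as pd

    # Hypothetical consumer of the workflow's output, not part of this commit
    df = pd.read_json('https://raw.githubusercontent.com/gabrielmacedoanac/flat-data-anac/main/regulamentos-anac-tags.json')
    print(df[['data', 'tipo_normatico', 'tags']].head())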
