Skip to content

Commit

Permalink
Optimize CIDR computation
Browse files Browse the repository at this point in the history
  • Loading branch information
Kseen715 committed Aug 12, 2024
1 parent bcc526d commit afb45d0
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 19 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ We include all known proxies of the listed sites to raise the chance of correct routing.
- facebook.com - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
- instagram.com - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
- meta - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
- netflix - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
- openai.com
- x.com - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
- youtube.com - from [[x]](https://github.com/touhidurrr/iplist-youtube?tab=readme-ov-file) [[x]](https://www.gstatic.com/ipranges/goog.json) [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
- microsoft - from [[x]](https://networksdb.io/ip-addresses-of/microsoft-corp)
- netflix - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
- nhentai.net - from [[x]](https://networksdb.io/domain-to-ips/nhentai.net)
- openai.com
- rutracker.org - from [[x]](https://networksdb.io/)
- x.com - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
- xhamster.com - from [[x]](https://networksdb.io/)
- xvideos.com - from [[x]](https://networksdb.io/)
- youtube.com - from [[x]](https://github.com/touhidurrr/iplist-youtube?tab=readme-ov-file) [[x]](https://www.gstatic.com/ipranges/goog.json) [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
<div id="auto-sort-end"/>

Databases not currently used, but ready to deploy:
Expand Down
21 changes: 6 additions & 15 deletions src/sort_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
from os import listdir
from os.path import isfile, join
import pandas as pd


def drop_duplicates(data):
Expand All @@ -14,11 +15,9 @@ def drop_duplicates(data):
not_cidr_data = data[~data['ipv4'].str.contains('/')]
for index, row in cidr_data.iterrows():
try:
ip = ipaddress.ip_network(row['ipv4'], strict=False)
for i in ip.hosts():
if str(i) in not_cidr_data['ipv4'].values:
not_cidr_data = not_cidr_data[~(not_cidr_data['ipv4'].astype(str) == str(i))]
log_info(f'Dropped {str(i)} because it is included in {row["ipv4"]}')
ip_net = ipaddress.ip_network(row['ipv4'], strict=False)
not_cidr_data.loc[:, 'ipv4'] = not_cidr_data['ipv4'].apply(lambda x: None if ipaddress.ip_address(x) in ip_net else x)
not_cidr_data = not_cidr_data.dropna(subset=['ipv4'])
except ValueError as e:
log_warning(f"Invalid CIDR notation {row['ipv4']}: {e}")
# remove invalid CIDR
Expand Down Expand Up @@ -66,25 +65,17 @@ def drop_duplicates_in_known(data):
if len(not_cidr_data) == 0:
log_info("No non-CIDR data found")
else:
cidr_ips = set()
for cidr in cidr_data.iloc[:, 0]:
try:
ip_network = ipaddress.ip_network(cidr, strict=False)
cidr_ips.update(str(ip) for ip in ip_network.hosts())
not_cidr_data['ipv4'] = not_cidr_data.iloc[:, 0].apply(lambda x: None if ipaddress.ip_address(x) in ip_network else x)
not_cidr_data = not_cidr_data.dropna(subset=[not_cidr_data.columns[0]])
except ValueError as e:
log_warning(f"Invalid CIDR notation {cidr}: {e}")
# remove invalid CIDR
cidr_data = cidr_data[~(cidr_data.iloc[:, 0].astype(str) == cidr)]
log_info(f'Dropped {cidr} because it is invalid CIDR notation')


not_cidr_data = not_cidr_data[~not_cidr_data.iloc[:, 0].astype(str).isin(cidr_ips)]

dropped_ips = set(original_data.iloc[:, 0].astype(str)) - set(not_cidr_data.iloc[:, 0].astype(str)) - set(cidr_data.iloc[:, 0].astype(str))
for ip in dropped_ips:
log_info(f'Dropped {ip} because it is included in a CIDR range')


data = pd.concat([not_cidr_data, cidr_data], ignore_index=True)
if len(original_data) != len(data):
log_happy(f"Dropped {len(original_data) - len(data)} duplicate IP addresses")
Expand Down

0 comments on commit afb45d0

Please sign in to comment.