From 5f51f9448efc3092f5612179d7114a2e56fac2ad Mon Sep 17 00:00:00 2001 From: Jules Dejaeghere Date: Sun, 7 Jul 2024 11:56:52 +0200 Subject: [PATCH] Add option to use consensus weight instead of count --- README.md | 8 ++++-- map.py | 78 ++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 54 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 46e2a7c..8e1ca77 100644 --- a/README.md +++ b/README.md @@ -20,11 +20,15 @@ The program takes two arguments: the filename of the consensus and the filename python map.py 2024-03-27-13-00-00-consensus GeoLite2-City.mmdb ``` -A third (optional) parameter can control the density of the clusters. The default is 1.5 and generally gives nice maps. +### Optional parameters + +`--eps 1.5`: control the density of the clusters. The default is 1.5 and generally gives nice maps. The higher the value, the bigger the clusters. +`--weight`: if set, use the consensus weight instead of the count of relays for the size of the clusters. + ```shell -python map.py 2024-03-27-13-00-00-consensus GeoLite2-City.mmdb 1.5 +python map.py 2024-03-27-13-00-00-consensus GeoLite2-City.mmdb --eps 1.5 --weight ``` ## Using OSM data (optional) diff --git a/map.py b/map.py index d614545..63f91d0 100644 --- a/map.py +++ b/map.py @@ -1,22 +1,34 @@ -import matplotlib.pyplot as plt -import matplotlib.colors -import geoip2.database -from sklearn.cluster import DBSCAN -import matplotlib.gridspec as gridspec -from cartopy.io.img_tiles import * +import re from math import log + import fire +import geoip2.database +import matplotlib.colors +import matplotlib.gridspec as gridspec +import matplotlib.pyplot as plt +from cartopy.io.img_tiles import * +from sklearn.cluster import DBSCAN + +relay_pattern = re.compile( + '(^r (?P<nickname>\S*) (?P<identity>\S*) (?P<digest>\S*) (?P<publication>\S* \S*) (?P<ip>\S*) (?P<orport>\S*) (?P<dirport>\S*)$\n)' + '(^a (?P<ipv6>\S*)$\n)?' + '(^s (?P<flags>(\S ?)*)$\n)' + '(^v (?P<version>.*)$\n)' + '(^pr .*$\n)?' 
+ '(?P<w>^w (Bandwidth=(?P<bandwidth>\d*)).*$\n)' + '(?P<p>^p .*$)', re.MULTILINE) -def cluster_coordinates(coordinates, eps=1.5, min_samples=1): +def cluster_coordinates(coordinates, eps=1.5, weight=False): """ Use DBSCAN to cluster points and have a readable map :param coordinates: list of points (lat, lon) :param eps: control the density of the cluster - :param min_samples: minimum number of samples in a cluster + :param weight: if True, use consensus weight instead of number of relays """ - dbscan = DBSCAN(eps=eps, min_samples=min_samples) - dbscan.fit(coordinates) + dbscan = DBSCAN(eps=eps, min_samples=1) + lat_lon = coordinates[:, [0, 1]] + dbscan.fit(lat_lon) labels = dbscan.labels_ cluster_centers = [] cluster_counts = [] @@ -26,12 +38,12 @@ def cluster_coordinates(coordinates, eps=1.5, min_samples=1): continue cluster_mask = (labels == label) cluster_points = coordinates[cluster_mask] - cluster_centers.append(np.mean(cluster_points, axis=0)) - cluster_counts.append(np.sum(cluster_mask)) + cluster_centers.append(np.mean(cluster_points[:, [0, 1]], axis=0)) + if weight: + cluster_counts.append(np.sum(cluster_points[:, [2]])) + else: + cluster_counts.append(len(cluster_points)) - cluster_points = coordinates[(labels == -1)] - cluster_centers += list(cluster_points) - cluster_counts += [1] * len(cluster_points) r = list(zip(cluster_centers, cluster_counts)) return r, max(cluster_counts), min(cluster_counts) @@ -47,39 +59,41 @@ def geo_ip(ip, reader): return [response.location.longitude, response.location.latitude] -def get_ip_from_consensus(filename): +def get_details_from_consensus(filename): """ Get the IP addresses of the relays present in the consensus at filename :param filename: filename of the consensus :return: list of IP of the relays in the consensus """ result = [] - with open(filename, 'r') as file: - for line in file: - if line.startswith("r "): - fields = line.split() - if len(fields) >= 7: - result.append(fields[6]) + with open(filename, 'r') as f: + for match in 
relay_pattern.finditer(f.read()): + result.append(match.groupdict()) return result -def main(consensus_file, geoip_data_file, eps=1.5): +def main(consensus_file, geoip_data_file, eps=1.5, weight=False): """ Create a map based on the consensus_file and geoip_data_file :param consensus_file: filename of a Tor consensus, see https://metrics.torproject.org/collector/recent/relay-descriptors/consensuses/ :param geoip_data_file: MaxMind mmdb filename, see https://dev.maxmind.com/geoip/geolite2-free-geolocation-data :param eps: control the density of the cluster on the map + :param weight: if True, use consensus weight instead of number of relays """ print('Reading consensus file') - ips = get_ip_from_consensus(consensus_file) - print(f'Found {len(ips)} relays') + relays = get_details_from_consensus(consensus_file) + print(f'Found {len(relays)} relays') points = list() print('Geocoding IP addresses') reader = geoip2.database.Reader(geoip_data_file) - for ip in ips: - points.append(geo_ip(ip, reader)) + for relay in relays: + p = geo_ip(relay['ip'], reader) + if p[0] is None or p[1] is None: + print(f"Could not geocode the following IP: {relay['ip']}. 
Skipping it") + else: + points.append(p + [int(relay['bandwidth'])]) points = np.array(points) - points, vmax, vmin = cluster_coordinates(points, eps=eps) + points, vmax, vmin = cluster_coordinates(points, eps=eps, weight=weight) fig = plt.figure(figsize=(10, 5)) gs = gridspec.GridSpec(2, 1, height_ratios=[1, 0.05], figure=fig) @@ -101,8 +115,9 @@ def main(consensus_file, geoip_data_file, eps=1.5): cmap = plt.cm.hot norm = matplotlib.colors.LogNorm(vmin=vmin, vmax=vmax) + div = 2000 if weight else 1 for pos, count in points: - ax.plot(pos[0], pos[1], 'o', markersize=max(4 * log(count, 10), 2), transform=ccrs.PlateCarree(), + ax.plot(pos[0], pos[1], 'o', markersize=max(4 * log(count/div, 10), 2), transform=ccrs.PlateCarree(), color=cmap(norm(count))) ax.set_global() @@ -112,7 +127,10 @@ def main(consensus_file, geoip_data_file, eps=1.5): sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm) cbar = plt.colorbar(sm, cax=cb_ax, orientation='horizontal') - cbar.set_label('Number of relays') + if weight: + cbar.set_label('Consensus weight') + else: + cbar.set_label('Number of relays') plt.tight_layout() print('Saving map as map.png')