Add option to use consensus weight instead of count

This commit is contained in:
Jules 2024-07-07 11:56:52 +02:00
parent 3a68a988c1
commit 5f51f9448e
Signed by: jdejaegh
GPG key ID: 99D6D184CA66933A
2 changed files with 54 additions and 32 deletions

View file

@ -20,11 +20,15 @@ The program takes two arguments: the filename of the consensus and the filename
```shell
python map.py 2024-03-27-13-00-00-consensus GeoLite2-City.mmdb
```

### Optional parameters

`--eps 1.5`: control the density of the clusters. The default is 1.5 and generally gives nice maps.
The higher the value, the bigger the clusters.

`--weight`: if set, use the consensus weight instead of the count of relays for the size of the clusters.

```shell
python map.py 2024-03-27-13-00-00-consensus GeoLite2-City.mmdb --eps 1.5 --weight
```

## Using OSM data (optional)

76
map.py
View file

@ -1,22 +1,34 @@
import matplotlib.pyplot as plt import re
import matplotlib.colors
import geoip2.database
from sklearn.cluster import DBSCAN
import matplotlib.gridspec as gridspec
from cartopy.io.img_tiles import *
from math import log from math import log
import fire import fire
import geoip2.database
import matplotlib.colors
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from cartopy.io.img_tiles import *
from sklearn.cluster import DBSCAN
# Matches one relay entry of a Tor network-status consensus document:
# the mandatory "r" (router), "s" (flags), "v" (version), "w" (bandwidth)
# and "p" (ports) lines, plus the optional "a" (IPv6 address) and
# "pr" (protocol versions) lines in between.
# Raw strings are used so that \S and \d are passed to the regex engine
# verbatim instead of being treated as (invalid) string escape sequences.
relay_pattern = re.compile(
    r'(^r (?P<nickname>\S*) (?P<id>\S*) (?P<digest>\S*) (?P<publication>\S* \S*) (?P<ip>\S*) (?P<orport>\S*) (?P<dirport>\S*)$\n)'
    r'(^a (?P<ipv6>\S*)$\n)?'
    r'(^s (?P<flags>(\S ?)*)$\n)'
    r'(^v (?P<version>.*)$\n)'
    r'(^pr .*$\n)?'
    r'(?P<weight>^w (Bandwidth=(?P<bandwidth>\d*)).*$\n)'
    r'(?P<ports>^p .*$)', re.MULTILINE)
def cluster_coordinates(coordinates, eps=1.5, weight=False):
    """
    Use DBSCAN to cluster nearby relays so the map stays readable.

    :param coordinates: numpy array of shape (n, 3); each row is
                        (lon, lat, bandwidth) as produced by geo_ip plus the
                        relay's consensus weight.  Only the first two columns
                        participate in the distance computation.
    :param eps: control the density of the clusters; higher values merge
                points that are further apart into one cluster
    :param weight: if True, size each cluster by its summed consensus weight
                   instead of its relay count
    :return: (list of (center, count) pairs, max count, min count)
    """
    # Cluster on the geographic columns only -- the bandwidth column must
    # not influence the spatial distance metric.
    dbscan = DBSCAN(eps=eps, min_samples=1)
    lat_lon = coordinates[:, [0, 1]]
    dbscan.fit(lat_lon)
    labels = dbscan.labels_

    cluster_centers = []
    cluster_counts = []
    # NOTE(review): with min_samples=1 DBSCAN assigns every point to a
    # cluster, so the noise label (-1) should never occur; the guard below
    # is purely defensive.
    for label in set(labels):
        if label == -1:
            continue
        cluster_mask = (labels == label)
        cluster_points = coordinates[cluster_mask]
        # Cluster center is the mean of the geographic columns only.
        cluster_centers.append(np.mean(cluster_points[:, [0, 1]], axis=0))
        if weight:
            # Sum the consensus weight (third column) of the cluster's relays.
            cluster_counts.append(np.sum(cluster_points[:, [2]]))
        else:
            cluster_counts.append(len(cluster_points))

    r = list(zip(cluster_centers, cluster_counts))
    return r, max(cluster_counts), min(cluster_counts)
@ -47,39 +59,41 @@ def geo_ip(ip, reader):
return [response.location.longitude, response.location.latitude] return [response.location.longitude, response.location.latitude]
def get_details_from_consensus(filename):
    """
    Parse the relay entries of the consensus at *filename*.

    :param filename: filename of the consensus
    :return: list of dicts, one per relay, holding the named groups of
             ``relay_pattern`` (nickname, id, ip, orport, bandwidth, flags, ...)
    """
    # Each relay entry spans several consecutive lines, so match the regex
    # against the whole file content rather than iterating line by line.
    with open(filename, 'r') as f:
        return [match.groupdict() for match in relay_pattern.finditer(f.read())]
def main(consensus_file, geoip_data_file, eps=1.5): def main(consensus_file, geoip_data_file, eps=1.5, weight=False):
""" """
Create a map based on the consensus_file and geoip_data_file Create a map based on the consensus_file and geoip_data_file
:param consensus_file: filename of a Tor consensus, see https://metrics.torproject.org/collector/recent/relay-descriptors/consensuses/ :param consensus_file: filename of a Tor consensus, see https://metrics.torproject.org/collector/recent/relay-descriptors/consensuses/
:param geoip_data_file: MaxMind mmdb filename, see https://dev.maxmind.com/geoip/geolite2-free-geolocation-data :param geoip_data_file: MaxMind mmdb filename, see https://dev.maxmind.com/geoip/geolite2-free-geolocation-data
:param eps: control the density of the cluster on the map :param eps: control the density of the cluster on the map
:param weight: if True, use consensus weight instead of number of relays
""" """
print('Reading consensus file') print('Reading consensus file')
ips = get_ip_from_consensus(consensus_file) relays = get_details_from_consensus(consensus_file)
print(f'Found {len(ips)} relays') print(f'Found {len(relays)} relays')
points = list() points = list()
print('Geocoding IP addresses') print('Geocoding IP addresses')
reader = geoip2.database.Reader(geoip_data_file) reader = geoip2.database.Reader(geoip_data_file)
for ip in ips: for relay in relays:
points.append(geo_ip(ip, reader)) p = geo_ip(relay['ip'], reader)
if p[0] is None or p[1] is None:
print(f"Could not geocode the following IP: {relay['ip']}. Skipping it")
else:
points.append(p + [int(relay['bandwidth'])])
points = np.array(points) points = np.array(points)
points, vmax, vmin = cluster_coordinates(points, eps=eps) points, vmax, vmin = cluster_coordinates(points, eps=eps, weight=weight)
fig = plt.figure(figsize=(10, 5)) fig = plt.figure(figsize=(10, 5))
gs = gridspec.GridSpec(2, 1, height_ratios=[1, 0.05], figure=fig) gs = gridspec.GridSpec(2, 1, height_ratios=[1, 0.05], figure=fig)
@ -101,8 +115,9 @@ def main(consensus_file, geoip_data_file, eps=1.5):
cmap = plt.cm.hot cmap = plt.cm.hot
norm = matplotlib.colors.LogNorm(vmin=vmin, vmax=vmax) norm = matplotlib.colors.LogNorm(vmin=vmin, vmax=vmax)
div = 2000 if weight else 1
for pos, count in points: for pos, count in points:
ax.plot(pos[0], pos[1], 'o', markersize=max(4 * log(count, 10), 2), transform=ccrs.PlateCarree(), ax.plot(pos[0], pos[1], 'o', markersize=max(4 * log(count/div, 10), 2), transform=ccrs.PlateCarree(),
color=cmap(norm(count))) color=cmap(norm(count)))
ax.set_global() ax.set_global()
@ -112,6 +127,9 @@ def main(consensus_file, geoip_data_file, eps=1.5):
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm) sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
cbar = plt.colorbar(sm, cax=cb_ax, orientation='horizontal') cbar = plt.colorbar(sm, cax=cb_ax, orientation='horizontal')
if weight:
cbar.set_label('Consensus weight')
else:
cbar.set_label('Number of relays') cbar.set_label('Number of relays')
plt.tight_layout() plt.tight_layout()