Add option to use consensus weight instead of count

This commit is contained in:
Jules 2024-07-07 11:56:52 +02:00
parent 3a68a988c1
commit 5f51f9448e
Signed by: jdejaegh
GPG key ID: 99D6D184CA66933A
2 changed files with 54 additions and 32 deletions

View file

@ -20,11 +20,15 @@ The program takes two arguments: the filename of the consensus and the filename
```shell
python map.py 2024-03-27-13-00-00-consensus GeoLite2-City.mmdb
```

### Optional parameters

`--eps 1.5`: control the density of the clusters. The default is 1.5 and generally gives nice maps.
The higher the value, the bigger the clusters.

`--weight`: if set, use the consensus weight instead of the count of relays for the size of the clusters.

```shell
python map.py 2024-03-27-13-00-00-consensus GeoLite2-City.mmdb --eps 1.5 --weight
```

## Using OSM data (optional)

76
map.py
View file

@ -1,22 +1,34 @@
import matplotlib.pyplot as plt import re
import matplotlib.colors
import geoip2.database
from sklearn.cluster import DBSCAN
import matplotlib.gridspec as gridspec
from cartopy.io.img_tiles import *
from math import log from math import log
import fire import fire
import geoip2.database
import matplotlib.colors
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from cartopy.io.img_tiles import *
from sklearn.cluster import DBSCAN
# Matches one relay entry of a Tor network-status consensus document:
# the mandatory "r" (router), "s" (flags), "v" (version), "w" (bandwidth)
# and "p" (ports) lines, plus the optional "a" (IPv6 address) and
# "pr" (protocol versions) lines in between.
# Raw strings are used so that \S and \d are passed to the regex engine
# verbatim instead of being treated as (invalid) string escape sequences.
relay_pattern = re.compile(
    r'(^r (?P<nickname>\S*) (?P<id>\S*) (?P<digest>\S*) (?P<publication>\S* \S*) (?P<ip>\S*) (?P<orport>\S*) (?P<dirport>\S*)$\n)'
    r'(^a (?P<ipv6>\S*)$\n)?'
    r'(^s (?P<flags>(\S ?)*)$\n)'
    r'(^v (?P<version>.*)$\n)'
    r'(^pr .*$\n)?'
    r'(?P<weight>^w (Bandwidth=(?P<bandwidth>\d*)).*$\n)'
    r'(?P<ports>^p .*$)', re.MULTILINE)
def cluster_coordinates(coordinates, eps=1.5, weight=False):
    """
    Use DBSCAN to cluster nearby relays so the map stays readable.

    :param coordinates: numpy array of shape (n, 3); each row is
                        (lon, lat, bandwidth) as produced by geo_ip plus the
                        relay's consensus weight.  Only the first two columns
                        participate in the distance computation.
    :param eps: control the density of the clusters; higher values merge
                points that are further apart into one cluster
    :param weight: if True, size each cluster by its summed consensus weight
                   instead of its relay count
    :return: (list of (center, count) pairs, max count, min count)
    """
    # Cluster on the geographic columns only -- the bandwidth column must
    # not influence the spatial distance metric.
    dbscan = DBSCAN(eps=eps, min_samples=1)
    lat_lon = coordinates[:, [0, 1]]
    dbscan.fit(lat_lon)
    labels = dbscan.labels_

    cluster_centers = []
    cluster_counts = []
    # NOTE(review): with min_samples=1 DBSCAN assigns every point to a
    # cluster, so the noise label (-1) should never occur; the guard below
    # is purely defensive.
    for label in set(labels):
        if label == -1:
            continue
        cluster_mask = (labels == label)
        cluster_points = coordinates[cluster_mask]
        # Cluster center is the mean of the geographic columns only.
        cluster_centers.append(np.mean(cluster_points[:, [0, 1]], axis=0))
        if weight:
            # Sum the consensus weight (third column) of the cluster's relays.
            cluster_counts.append(np.sum(cluster_points[:, [2]]))
        else:
            cluster_counts.append(len(cluster_points))

    r = list(zip(cluster_centers, cluster_counts))
    return r, max(cluster_counts), min(cluster_counts)
@ -47,39 +59,41 @@ def geo_ip(ip, reader):
return [response.location.longitude, response.location.latitude] return [response.location.longitude, response.location.latitude]
def get_details_from_consensus(filename):
    """
    Parse the relay entries of the consensus at *filename*.

    :param filename: filename of the consensus
    :return: list of dicts, one per relay, holding the named groups of
             ``relay_pattern`` (nickname, id, ip, orport, bandwidth, flags, ...)
    """
    # Each relay entry spans several consecutive lines, so match the regex
    # against the whole file content rather than iterating line by line.
    with open(filename, 'r') as f:
        return [match.groupdict() for match in relay_pattern.finditer(f.read())]
def main(consensus_file, geoip_data_file, eps=1.5): def main(consensus_file, geoip_data_file, eps=1.5, weight=False):
""" """
Create a map based on the consensus_file and geoip_data_file Create a map based on the consensus_file and geoip_data_file
:param consensus_file: filename of a Tor consensus, see https://metrics.torproject.org/collector/recent/relay-descriptors/consensuses/ :param consensus_file: filename of a Tor consensus, see https://metrics.torproject.org/collector/recent/relay-descriptors/consensuses/
:param geoip_data_file: MaxMind mmdb filename, see https://dev.maxmind.com/geoip/geolite2-free-geolocation-data :param geoip_data_file: MaxMind mmdb filename, see https://dev.maxmind.com/geoip/geolite2-free-geolocation-data
:param eps: control the density of the cluster on the map :param eps: control the density of the cluster on the map
:param weight: if True, use consensus weight instead of number of relays
""" """
print('Reading consensus file') print('Reading consensus file')
ips = get_ip_from_consensus(consensus_file) relays = get_details_from_consensus(consensus_file)
print(f'Found {len(ips)} relays') print(f'Found {len(relays)} relays')
points = list() points = list()
print('Geocoding IP addresses') print('Geocoding IP addresses')
reader = geoip2.database.Reader(geoip_data_file) reader = geoip2.database.Reader(geoip_data_file)
for ip in ips: for relay in relays:
points.append(geo_ip(ip, reader)) p = geo_ip(relay['ip'], reader)
if p[0] is None or p[1] is None:
print(f"Could not geocode the following IP: {relay['ip']}. Skipping it")
else:
points.append(p + [int(relay['bandwidth'])])
points = np.array(points) points = np.array(points)
points, vmax, vmin = cluster_coordinates(points, eps=eps) points, vmax, vmin = cluster_coordinates(points, eps=eps, weight=weight)
fig = plt.figure(figsize=(10, 5)) fig = plt.figure(figsize=(10, 5))
gs = gridspec.GridSpec(2, 1, height_ratios=[1, 0.05], figure=fig) gs = gridspec.GridSpec(2, 1, height_ratios=[1, 0.05], figure=fig)
@ -101,8 +115,9 @@ def main(consensus_file, geoip_data_file, eps=1.5):
cmap = plt.cm.hot cmap = plt.cm.hot
norm = matplotlib.colors.LogNorm(vmin=vmin, vmax=vmax) norm = matplotlib.colors.LogNorm(vmin=vmin, vmax=vmax)
div = 2000 if weight else 1
for pos, count in points: for pos, count in points:
ax.plot(pos[0], pos[1], 'o', markersize=max(4 * log(count, 10), 2), transform=ccrs.PlateCarree(), ax.plot(pos[0], pos[1], 'o', markersize=max(4 * log(count/div, 10), 2), transform=ccrs.PlateCarree(),
color=cmap(norm(count))) color=cmap(norm(count)))
ax.set_global() ax.set_global()
@ -112,6 +127,9 @@ def main(consensus_file, geoip_data_file, eps=1.5):
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm) sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
cbar = plt.colorbar(sm, cax=cb_ax, orientation='horizontal') cbar = plt.colorbar(sm, cax=cb_ax, orientation='horizontal')
if weight:
cbar.set_label('Consensus weight')
else:
cbar.set_label('Number of relays') cbar.set_label('Number of relays')
plt.tight_layout() plt.tight_layout()