mirror of
https://github.com/jdejaegh/cartopy-tor-relays.git
synced 2025-06-26 21:15:40 +02:00
Add option to use consensus weight instead of count
This commit is contained in:
parent
3a68a988c1
commit
5f51f9448e
2 changed files with 54 additions and 32 deletions
|
@ -20,11 +20,15 @@ The program takes two arguments: the filename of the consensus and the filename
|
|||
python map.py 2024-03-27-13-00-00-consensus GeoLite2-City.mmdb
|
||||
```
|
||||
|
||||
A third (optional) parameter can control the density of the clusters. The default is 1.5 and generally gives nice maps.
|
||||
### Optional parameters
|
||||
|
||||
`--eps 1.5`: control the density of the clusters. The default is 1.5 and generally gives nice maps.
|
||||
The higher the value, the bigger the clusters.
|
||||
|
||||
`--weight`: if set, use the consensus weight instead of the count of relays for the size of the clusters.
|
||||
|
||||
```shell
|
||||
python map.py 2024-03-27-13-00-00-consensus GeoLite2-City.mmdb 1.5
|
||||
python map.py 2024-03-27-13-00-00-consensus GeoLite2-City.mmdb --eps 1.5 --weight
|
||||
```
|
||||
|
||||
## Using OSM data (optional)
|
||||
|
|
78
map.py
78
map.py
|
@ -1,22 +1,34 @@
|
|||
import matplotlib.pyplot as plt
|
||||
import matplotlib.colors
|
||||
import geoip2.database
|
||||
from sklearn.cluster import DBSCAN
|
||||
import matplotlib.gridspec as gridspec
|
||||
from cartopy.io.img_tiles import *
|
||||
import re
|
||||
from math import log
|
||||
|
||||
import fire
|
||||
import geoip2.database
|
||||
import matplotlib.colors
|
||||
import matplotlib.gridspec as gridspec
|
||||
import matplotlib.pyplot as plt
|
||||
from cartopy.io.img_tiles import *
|
||||
from sklearn.cluster import DBSCAN
|
||||
|
||||
# Matches one relay entry of a Tor consensus file, from its "r" line through
# its "p" line, exposing the interesting fields as named groups.
# Raw strings are used so that regex escapes such as \S and \d reach the `re`
# module untouched instead of being parsed as (invalid) Python string escapes.
relay_pattern = re.compile(
    # "r" line: nickname, id, digest, publication (two tokens), IP and both ports.
    r'(^r (?P<nickname>\S*) (?P<id>\S*) (?P<digest>\S*) (?P<publication>\S* \S*) (?P<ip>\S*) (?P<orport>\S*) (?P<dirport>\S*)$\n)'
    # Optional "a" line; at most one is accepted by this pattern.
    r'(^a (?P<ipv6>\S*)$\n)?'
    # "s" line: space-separated flags.
    r'(^s (?P<flags>(\S ?)*)$\n)'
    # "v" line: free-form version string.
    r'(^v (?P<version>.*)$\n)'
    # Optional "pr" line; its content is not captured.
    r'(^pr .*$\n)?'
    # "w" line: only the leading Bandwidth= value is captured as `bandwidth`.
    r'(?P<weight>^w (Bandwidth=(?P<bandwidth>\d*)).*$\n)'
    # "p" line: captured whole (including the leading "p ").
    r'(?P<ports>^p .*$)', re.MULTILINE)
|
||||
|
||||
|
||||
def cluster_coordinates(coordinates, eps=1.5, min_samples=1):
|
||||
def cluster_coordinates(coordinates, eps=1.5, weight=False):
|
||||
"""
|
||||
Use DBSCAN to cluster points and have a readable map
|
||||
:param coordinates: list of points (lat, lon)
|
||||
:param eps: control the density of the cluster
|
||||
:param min_samples: minimum number of samples in a cluster
|
||||
:param weight: if True, use consensus weight instead of number of relays
|
||||
"""
|
||||
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
|
||||
dbscan.fit(coordinates)
|
||||
dbscan = DBSCAN(eps=eps, min_samples=1)
|
||||
lat_lon = coordinates[:, [0, 1]]
|
||||
dbscan.fit(lat_lon)
|
||||
labels = dbscan.labels_
|
||||
cluster_centers = []
|
||||
cluster_counts = []
|
||||
|
@ -26,12 +38,12 @@ def cluster_coordinates(coordinates, eps=1.5, min_samples=1):
|
|||
continue
|
||||
cluster_mask = (labels == label)
|
||||
cluster_points = coordinates[cluster_mask]
|
||||
cluster_centers.append(np.mean(cluster_points, axis=0))
|
||||
cluster_counts.append(np.sum(cluster_mask))
|
||||
cluster_centers.append(np.mean(cluster_points[:, [0, 1]], axis=0))
|
||||
if weight:
|
||||
cluster_counts.append(np.sum(cluster_points[:, [2]]))
|
||||
else:
|
||||
cluster_counts.append(len(cluster_points))
|
||||
|
||||
cluster_points = coordinates[(labels == -1)]
|
||||
cluster_centers += list(cluster_points)
|
||||
cluster_counts += [1] * len(cluster_points)
|
||||
r = list(zip(cluster_centers, cluster_counts))
|
||||
return r, max(cluster_counts), min(cluster_counts)
|
||||
|
||||
|
@ -47,39 +59,41 @@ def geo_ip(ip, reader):
|
|||
return [response.location.longitude, response.location.latitude]
|
||||
|
||||
|
||||
def get_ip_from_consensus(filename):
|
||||
def get_details_from_consensus(filename):
|
||||
"""
|
||||
Get the IP addresses of the relays present in the consensus at filename
|
||||
:param filename: filename of the consensus
|
||||
:return: list of IP of the relays in the consensus
|
||||
"""
|
||||
result = []
|
||||
with open(filename, 'r') as file:
|
||||
for line in file:
|
||||
if line.startswith("r "):
|
||||
fields = line.split()
|
||||
if len(fields) >= 7:
|
||||
result.append(fields[6])
|
||||
with open(filename, 'r') as f:
|
||||
for match in relay_pattern.finditer(f.read()):
|
||||
result.append(match.groupdict())
|
||||
return result
|
||||
|
||||
|
||||
def main(consensus_file, geoip_data_file, eps=1.5):
|
||||
def main(consensus_file, geoip_data_file, eps=1.5, weight=False):
|
||||
"""
|
||||
Create a map based on the consensus_file and geoip_data_file
|
||||
:param consensus_file: filename of a Tor consensus, see https://metrics.torproject.org/collector/recent/relay-descriptors/consensuses/
|
||||
:param geoip_data_file: MaxMind mmdb filename, see https://dev.maxmind.com/geoip/geolite2-free-geolocation-data
|
||||
:param eps: control the density of the cluster on the map
|
||||
:param weight: if True, use consensus weight instead of number of relays
|
||||
"""
|
||||
print('Reading consensus file')
|
||||
ips = get_ip_from_consensus(consensus_file)
|
||||
print(f'Found {len(ips)} relays')
|
||||
relays = get_details_from_consensus(consensus_file)
|
||||
print(f'Found {len(relays)} relays')
|
||||
points = list()
|
||||
print('Geocoding IP addresses')
|
||||
reader = geoip2.database.Reader(geoip_data_file)
|
||||
for ip in ips:
|
||||
points.append(geo_ip(ip, reader))
|
||||
for relay in relays:
|
||||
p = geo_ip(relay['ip'], reader)
|
||||
if p[0] is None or p[1] is None:
|
||||
print(f"Could not geocode the following IP: {relay['ip']}. Skipping it")
|
||||
else:
|
||||
points.append(p + [int(relay['bandwidth'])])
|
||||
points = np.array(points)
|
||||
points, vmax, vmin = cluster_coordinates(points, eps=eps)
|
||||
points, vmax, vmin = cluster_coordinates(points, eps=eps, weight=weight)
|
||||
|
||||
fig = plt.figure(figsize=(10, 5))
|
||||
gs = gridspec.GridSpec(2, 1, height_ratios=[1, 0.05], figure=fig)
|
||||
|
@ -101,8 +115,9 @@ def main(consensus_file, geoip_data_file, eps=1.5):
|
|||
|
||||
cmap = plt.cm.hot
|
||||
norm = matplotlib.colors.LogNorm(vmin=vmin, vmax=vmax)
|
||||
div = 2000 if weight else 1
|
||||
for pos, count in points:
|
||||
ax.plot(pos[0], pos[1], 'o', markersize=max(4 * log(count, 10), 2), transform=ccrs.PlateCarree(),
|
||||
ax.plot(pos[0], pos[1], 'o', markersize=max(4 * log(count/div, 10), 2), transform=ccrs.PlateCarree(),
|
||||
color=cmap(norm(count)))
|
||||
|
||||
ax.set_global()
|
||||
|
@ -112,7 +127,10 @@ def main(consensus_file, geoip_data_file, eps=1.5):
|
|||
|
||||
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
|
||||
cbar = plt.colorbar(sm, cax=cb_ax, orientation='horizontal')
|
||||
cbar.set_label('Number of relays')
|
||||
if weight:
|
||||
cbar.set_label('Consensus weight')
|
||||
else:
|
||||
cbar.set_label('Number of relays')
|
||||
|
||||
plt.tight_layout()
|
||||
print('Saving map as map.png')
|
||||
|
|
Loading…
Add table
Reference in a new issue