commit 3a68a988c1ad8c9c2ee3bd9e2149747141a8cddc Author: Jules Dejaeghere Date: Wed Mar 27 14:59:19 2024 +0100 Add code diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..68bc17f --- /dev/null +++ b/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ee886cb --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Jules Dejaeghere + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..46e2a7c --- /dev/null +++ b/README.md @@ -0,0 +1,54 @@ +# Map Tor relays using Cartopy +Create a map showing the geographic location of the Tor relays + +## Setup + +1. Create a venv: `python -m venv venv && source venv/bin/activate` +2. Install requirements: `pip install -r requirements.txt` + +## Get the data to create the map + +You'll need two files: +1. The Tor consensus you want to use. Download one at: https://metrics.torproject.org/collector/recent/relay-descriptors/consensuses/ +2. The _GeoLite2 City_ file from MaxMind (not the CSV format). 
See their website to create an account and get the file: https://dev.maxmind.com/geoip/geolite2-free-geolocation-data + +## Run the program + +The program takes two arguments: the filename of the consensus and the filename of the GeoLite2 mmdb file. + +```shell +python map.py 2024-03-27-13-00-00-consensus GeoLite2-City.mmdb +``` + +A third (optional) parameter can control the density of the clusters. The default is 1.5 and generally gives nice maps. +The higher the value, the bigger the clusters. + +```shell +python map.py 2024-03-27-13-00-00-consensus GeoLite2-City.mmdb 1.5 +``` + +## Using OSM data (optional) + +It is possible to use data from OpenStreetMap as background (instead of the default cartopy image). + +Check the code and the `TODO` comment to enable that. If you use OSM data and Mapbox as suggested in the comment, +please attribute it properly. + +The following attribution line is generally enough: + +> © Mapbox © OpenStreetMap Improve this map + +## Attribution + +This attribution or a similar one should be included when you use this script with MaxMind data. 
+
+> This product includes GeoLite2 Data created by MaxMind, available from https://www.maxmind.com
+
+## Examples of maps
+
+### Default cartopy background
+
+![map with cartopy background](img/map.png)
+
+### With a custom style from MapBox
+![map with osm data](img/map_osm.png)
\ No newline at end of file
diff --git a/img/map.png b/img/map.png
new file mode 100644
index 0000000..5b99c1e
Binary files /dev/null and b/img/map.png differ
diff --git a/img/map_osm.png b/img/map_osm.png
new file mode 100644
index 0000000..1c2d680
Binary files /dev/null and b/img/map_osm.png differ
diff --git a/map.py b/map.py
new file mode 100644
index 0000000..d614545
--- /dev/null
+++ b/map.py
@@ -0,0 +1,168 @@
+# Imported first so that the explicit imports below win over any name it exports.
+# Only needed for the optional MapboxStyleTiles background (see the TODO in main).
+from cartopy.io.img_tiles import *
+from math import log
+
+import cartopy.crs as ccrs
+import fire
+import geoip2.database
+import geoip2.errors
+import matplotlib.colors
+import matplotlib.gridspec as gridspec
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.cluster import DBSCAN
+
+
+def cluster_coordinates(coordinates, eps=1.5, min_samples=1):
+    """
+    Use DBSCAN to cluster points and have a readable map
+
+    Each cluster is reduced to the mean of its members. Points DBSCAN labels as
+    noise (-1) are kept as individual clusters of size 1.
+    :param coordinates: array-like of points, one (lon, lat) row per relay
+    :param eps: control the density of the cluster
+    :param min_samples: minimum number of samples in a cluster
+    :return: (clusters, max_count, min_count), clusters being a list of
+             (center, count) tuples
+    :raises ValueError: if coordinates is empty
+    """
+    coordinates = np.asarray(coordinates)
+    if coordinates.size == 0:
+        raise ValueError('Cannot cluster an empty set of coordinates')
+
+    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(coordinates).labels_
+    cluster_centers = []
+    cluster_counts = []
+    for label in np.unique(labels):
+        if label == -1:
+            continue
+        cluster_mask = labels == label
+        cluster_centers.append(coordinates[cluster_mask].mean(axis=0))
+        cluster_counts.append(int(cluster_mask.sum()))
+
+    # Noise points are not merged: each one stays on the map as a single relay
+    noise_points = coordinates[labels == -1]
+    cluster_centers.extend(noise_points)
+    cluster_counts.extend([1] * len(noise_points))
+    clusters = list(zip(cluster_centers, cluster_counts))
+    return clusters, max(cluster_counts), min(cluster_counts)
+
+
+def geo_ip(ip, reader):
+    """
+    Geocode IP address using the given reader
+    :param ip: IP address
+    :param reader: a geoip2.database.Reader
+    :return: [lon, lat] location, a value is None when the database holds no
+             coordinates for the address
+    :raises geoip2.errors.AddressNotFoundError: if the address is not in the database
+    """
+    location = reader.city(ip).location
+    return [location.longitude, location.latitude]
+
+
+def get_ip_from_consensus(filename):
+    """
+    Get the IP addresses of the relays present in the consensus at filename
+
+    Only the lines starting with "r " are read; the address is their seventh
+    whitespace-separated field.
+    :param filename: filename of the consensus
+    :return: list of IP of the relays in the consensus
+    """
+    result = []
+    with open(filename, 'r') as file:
+        for line in file:
+            if not line.startswith("r "):
+                continue
+            fields = line.split()
+            if len(fields) >= 7:
+                result.append(fields[6])
+    return result
+
+
+def _geocode_relays(ips, reader):
+    """
+    Geocode every IP address, leaving out the ones that cannot be placed on the map
+    :param ips: iterable of IP addresses
+    :param reader: a geoip2.database.Reader
+    :return: list of [lon, lat] locations
+    """
+    points = []
+    for ip in ips:
+        try:
+            lon, lat = geo_ip(ip, reader)
+        except (geoip2.errors.AddressNotFoundError, ValueError):
+            # Address absent from the database, or not a valid IP address
+            continue
+        if lon is None or lat is None:
+            # Address known to the database but without coordinates
+            continue
+        points.append([lon, lat])
+    return points
+
+
+def main(consensus_file, geoip_data_file, eps=1.5, output='map.png'):
+    """
+    Create a map based on the consensus_file and geoip_data_file
+    :param consensus_file: filename of a Tor consensus, see https://metrics.torproject.org/collector/recent/relay-descriptors/consensuses/
+    :param geoip_data_file: MaxMind mmdb filename, see https://dev.maxmind.com/geoip/geolite2-free-geolocation-data
+    :param eps: control the density of the cluster on the map
+    :param output: filename of the image to write
+    """
+    print('Reading consensus file')
+    ips = get_ip_from_consensus(consensus_file)
+    print(f'Found {len(ips)} relays')
+
+    print('Geocoding IP addresses')
+    # The with block closes the database file as soon as geocoding is done
+    with geoip2.database.Reader(geoip_data_file) as reader:
+        points = _geocode_relays(ips, reader)
+    if len(points) < len(ips):
+        print(f'Skipped {len(ips) - len(points)} relays without a known location')
+    if not points:
+        raise SystemExit('No relay could be geolocated, nothing to draw')
+    points, vmax, vmin = cluster_coordinates(np.array(points), eps=eps)
+
+    fig = plt.figure(figsize=(10, 5))
+    gs = gridspec.GridSpec(2, 1, height_ratios=[1, 0.05], figure=fig)
+    ax = fig.add_subplot(gs[0], projection=ccrs.PlateCarree())
+
+    ax.stock_img()
+    ax.coastlines()
+
+    # TODO if you want to use OSM data with Mapbox, create an account and a custom style on Mapbox.
+    # Then, fill the credentials below, comment the ax.stock_img() and ax.coastlines() lines and
+    # uncomment the lines below
+    # see https://docs.mapbox.com/help/tutorials/create-a-custom-style/
+    # osm_tiles = MapboxStyleTiles(
+    #     access_token='',
+    #     map_id='',
+    #     username='',
+    #     cache=False)
+    # ax.add_image(osm_tiles, 4)
+
+    cmap = plt.cm.hot
+    norm = matplotlib.colors.LogNorm(vmin=vmin, vmax=vmax)
+    for pos, count in points:
+        # Marker size grows with log10 of the relay count, with a floor of 2 so single relays stay visible
+        ax.plot(pos[0], pos[1], 'o', markersize=max(4 * log(count, 10), 2), transform=ccrs.PlateCarree(),
+                color=cmap(norm(count)))
+
+    ax.set_global()
+    plt.box(False)
+    ax.set_extent([-170, 180, -60, 85], crs=ccrs.PlateCarree())
+    cb_ax = fig.add_subplot(gs[1])
+
+    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
+    cbar = plt.colorbar(sm, cax=cb_ax, orientation='horizontal')
+    cbar.set_label('Number of relays')
+
+    plt.tight_layout()
+    print(f'Saving map as {output}')
+    plt.savefig(output, dpi=300)
+
+
+if __name__ == '__main__':
+    fire.Fire(main)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..868c5fb
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+matplotlib
+geoip2
+scikit-learn
+cartopy
+fire
\ No newline at end of file