mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
Add helper stuff for figures, cleanup
This commit is contained in:
@@ -33,3 +33,8 @@ Verwendung:
|
||||
./convert_jupytext.sh py # Jupyter Notebook -> Python
|
||||
./convert_jupytext.sh nb # Python -> Jupyter Notebook
|
||||
```
|
||||
|
||||
## Weitere Infos
|
||||
|
||||
- [README BERTopic](./bertopic/README.md)
|
||||
- [README RAFT](./raft/README.md)
|
||||
|
||||
722
figures/bali_destinations_labeled.html
Normal file
722
figures/bali_destinations_labeled.html
Normal file
@@ -0,0 +1,722 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
|
||||
<meta http-equiv="content-type" content="text/html; charset=UTF-8" />
|
||||
<script src="https://cdn.jsdelivr.net/npm/leaflet@1.9.3/dist/leaflet.js"></script>
|
||||
<script src="https://code.jquery.com/jquery-3.7.1.min.js"></script>
|
||||
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/js/bootstrap.bundle.min.js"></script>
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/Leaflet.awesome-markers/2.0.2/leaflet.awesome-markers.js"></script>
|
||||
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/leaflet@1.9.3/dist/leaflet.css"/>
|
||||
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/css/bootstrap.min.css"/>
|
||||
<link rel="stylesheet" href="https://netdna.bootstrapcdn.com/bootstrap/3.0.0/css/bootstrap-glyphicons.css"/>
|
||||
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@6.2.0/css/all.min.css"/>
|
||||
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/Leaflet.awesome-markers/2.0.2/leaflet.awesome-markers.css"/>
|
||||
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/python-visualization/folium/folium/templates/leaflet.awesome.rotate.min.css"/>
|
||||
|
||||
<meta name="viewport" content="width=device-width,
|
||||
initial-scale=1.0, maximum-scale=1.0, user-scalable=no" />
|
||||
<style>
|
||||
#map_8827cd9e27b957cf12c465a4efd53c8e {
|
||||
position: relative;
|
||||
width: 100.0%;
|
||||
height: 100.0%;
|
||||
left: 0.0%;
|
||||
top: 0.0%;
|
||||
}
|
||||
.leaflet-container { font-size: 1rem; }
|
||||
</style>
|
||||
|
||||
<style>html, body {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
<style>#map {
|
||||
position:absolute;
|
||||
top:0;
|
||||
bottom:0;
|
||||
right:0;
|
||||
left:0;
|
||||
}
|
||||
</style>
|
||||
|
||||
<script>
|
||||
L_NO_TOUCH = false;
|
||||
L_DISABLE_3D = false;
|
||||
</script>
|
||||
|
||||
|
||||
</head>
|
||||
<body>
|
||||
|
||||
|
||||
<div class="folium-map" id="map_8827cd9e27b957cf12c465a4efd53c8e" ></div>
|
||||
|
||||
</body>
|
||||
<script>
|
||||
|
||||
|
||||
var map_8827cd9e27b957cf12c465a4efd53c8e = L.map(
|
||||
"map_8827cd9e27b957cf12c465a4efd53c8e",
|
||||
{
|
||||
center: [-8.45, 115.2],
|
||||
crs: L.CRS.EPSG3857,
|
||||
...{
|
||||
"zoom": 9,
|
||||
"zoomControl": true,
|
||||
"preferCanvas": false,
|
||||
"zoomSnap": 0.1,
|
||||
"zoomDelta": 0.1,
|
||||
}
|
||||
|
||||
}
|
||||
);
|
||||
L.control.scale().addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
var tile_layer_f4855f09fad51b54d44fb73a67dccf4e = L.tileLayer(
|
||||
"https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
|
||||
{
|
||||
"minZoom": 0,
|
||||
"maxZoom": 18,
|
||||
"maxNativeZoom": 18,
|
||||
"noWrap": false,
|
||||
"attribution": "\u0026copy; \u003ca href=\"https://www.openstreetmap.org/copyright\"\u003eOpenStreetMap\u003c/a\u003e contributors \u0026copy; \u003ca href=\"https://carto.com/attributions\"\u003eCARTO\u003c/a\u003e",
|
||||
"subdomains": "abcd",
|
||||
"detectRetina": false,
|
||||
"tms": false,
|
||||
"opacity": 1,
|
||||
}
|
||||
|
||||
);
|
||||
|
||||
|
||||
tile_layer_f4855f09fad51b54d44fb73a67dccf4e.addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var circle_marker_5b4ae9dceb9c71755162320a031409f2 = L.circleMarker(
|
||||
[-8.5187511, 115.2585973],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_5b4ae9dceb9c71755162320a031409f2.bindTooltip(
|
||||
`<div>
|
||||
Sacred Monkey Forest
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_602eb000016a6b30ed7c72519753de07 = L.marker(
|
||||
[-8.5187511, 115.2585973],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_452f3f1faacc701744d7c02bacafef1b = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eSacred Monkey Forest\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_602eb000016a6b30ed7c72519753de07.setIcon(div_icon_452f3f1faacc701744d7c02bacafef1b);
|
||||
|
||||
|
||||
var circle_marker_2e56d660baf35eabcbfa98ff6e8d8d11 = L.circleMarker(
|
||||
[-8.8291432, 115.0849069],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_2e56d660baf35eabcbfa98ff6e8d8d11.bindTooltip(
|
||||
`<div>
|
||||
Uluwatu Temple
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_5dd8dbfb675ede190e11f0f7ca07c3bc = L.marker(
|
||||
[-8.8291432, 115.0849069],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_2648ca76c6782f2660a05bdde37e3616 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eUluwatu Temple\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_5dd8dbfb675ede190e11f0f7ca07c3bc.setIcon(div_icon_2648ca76c6782f2660a05bdde37e3616);
|
||||
|
||||
|
||||
var circle_marker_bb05fc2ce9b498a72f2d5403de4c057a = L.circleMarker(
|
||||
[-8.673889, 115.263611],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_bb05fc2ce9b498a72f2d5403de4c057a.bindTooltip(
|
||||
`<div>
|
||||
Sanur Beach
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_ef590832f06fd20561b013b68756a271 = L.marker(
|
||||
[-8.673889, 115.263611],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_6c27875889040e5114bd58b6dd78d565 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eSanur Beach\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_ef590832f06fd20561b013b68756a271.setIcon(div_icon_6c27875889040e5114bd58b6dd78d565);
|
||||
|
||||
|
||||
var circle_marker_238718621a21030747436a452bfb3299 = L.circleMarker(
|
||||
[-8.618786, 115.086733],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_238718621a21030747436a452bfb3299.bindTooltip(
|
||||
`<div>
|
||||
Tanah Lot Temple
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_ae5f715c478f42e3f143541f3234b0f9 = L.marker(
|
||||
[-8.618786, 115.086733],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_66d943b7af7c007ae0e4b8134ca4900f = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eTanah Lot Temple\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_ae5f715c478f42e3f143541f3234b0f9.setIcon(div_icon_66d943b7af7c007ae0e4b8134ca4900f);
|
||||
|
||||
|
||||
var circle_marker_8771a4fca9bbd4915b07cc2700c5e89e = L.circleMarker(
|
||||
[-8.6925, 115.158611],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_8771a4fca9bbd4915b07cc2700c5e89e.bindTooltip(
|
||||
`<div>
|
||||
Seminyak Beach
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_6bb0332dd2f02d55130e014b19bffefe = L.marker(
|
||||
[-8.6925, 115.158611],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_9a4f199406a6917c3729d735293beec4 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eSeminyak Beach\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_6bb0332dd2f02d55130e014b19bffefe.setIcon(div_icon_9a4f199406a6917c3729d735293beec4);
|
||||
|
||||
|
||||
var circle_marker_51e42098d14cee4d8bbba1e8de44cb1a = L.circleMarker(
|
||||
[-8.791918, 115.225375],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_51e42098d14cee4d8bbba1e8de44cb1a.bindTooltip(
|
||||
`<div>
|
||||
Nusa Dua
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_6db92ef3d1d15b93e2f8951453121e0e = L.marker(
|
||||
[-8.791918, 115.225375],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_3a87774e80c4c355e408bb97f02e9e04 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eNusa Dua\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, -8],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_6db92ef3d1d15b93e2f8951453121e0e.setIcon(div_icon_3a87774e80c4c355e408bb97f02e9e04);
|
||||
|
||||
|
||||
var circle_marker_d43c0263ab8f5111318f226a7ebd0a1a = L.circleMarker(
|
||||
[-8.59128, 115.26456],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_d43c0263ab8f5111318f226a7ebd0a1a.bindTooltip(
|
||||
`<div>
|
||||
Bali Zoo
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_045f45d15d9bb0bf3544ec15c15e72ca = L.marker(
|
||||
[-8.59128, 115.26456],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_17abbfa0aa47dc5e2b90a3f3ed4031a5 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eBali Zoo\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_045f45d15d9bb0bf3544ec15c15e72ca.setIcon(div_icon_17abbfa0aa47dc5e2b90a3f3ed4031a5);
|
||||
|
||||
|
||||
var circle_marker_a7d61c5f9e133c503602ce1a176641d0 = L.circleMarker(
|
||||
[-8.23889, 115.3775],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_a7d61c5f9e133c503602ce1a176641d0.bindTooltip(
|
||||
`<div>
|
||||
Mount Batur
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_4158f6f747343e4e3a34a6decc5862c6 = L.marker(
|
||||
[-8.23889, 115.3775],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_a68ad209a222c1c6d07276e7c80e8d1c = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eMount Batur\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_4158f6f747343e4e3a34a6decc5862c6.setIcon(div_icon_a68ad209a222c1c6d07276e7c80e8d1c);
|
||||
|
||||
|
||||
var circle_marker_aed36500c42e8fc9bf3376b0e1bb2ed9 = L.circleMarker(
|
||||
[-8.275177, 115.1668487],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_aed36500c42e8fc9bf3376b0e1bb2ed9.bindTooltip(
|
||||
`<div>
|
||||
Ulun Danu Bratan
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_22a12c5d4517fbdbba1d7e4b93716e8b = L.marker(
|
||||
[-8.275177, 115.1668487],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_6ba395bc4ffc650104f4c3b4b96fa477 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eUlun Danu Bratan\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_22a12c5d4517fbdbba1d7e4b93716e8b.setIcon(div_icon_6ba395bc4ffc650104f4c3b4b96fa477);
|
||||
|
||||
|
||||
var circle_marker_9e78cc21d0b245c95b3a65818241d6b1 = L.circleMarker(
|
||||
[-8.411944, 115.5875],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_9e78cc21d0b245c95b3a65818241d6b1.bindTooltip(
|
||||
`<div>
|
||||
Tirta Gangga
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_acadfa63b305a6930490ce129db70d3c = L.marker(
|
||||
[-8.411944, 115.5875],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_6701732f8753d0cf3dd086583f966d47 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eTirta Gangga\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_acadfa63b305a6930490ce129db70d3c.setIcon(div_icon_6701732f8753d0cf3dd086583f966d47);
|
||||
|
||||
|
||||
var circle_marker_2bfd51976f3bff708534a582e4c0bf07 = L.circleMarker(
|
||||
[-8.84586, 115.18417],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_2bfd51976f3bff708534a582e4c0bf07.bindTooltip(
|
||||
`<div>
|
||||
Pandawa Beach
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_ed85f748464576595c1995b90bd453ef = L.marker(
|
||||
[-8.84586, 115.18417],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_760441791416950ed05bce0760e785b3 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003ePandawa Beach\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_ed85f748464576595c1995b90bd453ef.setIcon(div_icon_760441791416950ed05bce0760e785b3);
|
||||
|
||||
|
||||
var circle_marker_7905afef37932aa1ee010c0afc07b0e1 = L.circleMarker(
|
||||
[-8.79093, 115.16006],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_7905afef37932aa1ee010c0afc07b0e1.bindTooltip(
|
||||
`<div>
|
||||
Jimbaran Bay
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_b2515a0a726a9b31bb1349a731e14e83 = L.marker(
|
||||
[-8.79093, 115.16006],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_66b39b3aaa2ce168a11eb1e5842c4af5 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eJimbaran Bay\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_b2515a0a726a9b31bb1349a731e14e83.setIcon(div_icon_66b39b3aaa2ce168a11eb1e5842c4af5);
|
||||
|
||||
|
||||
var circle_marker_2e4a4da4c607525d0bd3ced67f91ba28 = L.circleMarker(
|
||||
[-8.6975074, 115.1610332],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_2e4a4da4c607525d0bd3ced67f91ba28.bindTooltip(
|
||||
`<div>
|
||||
Double Six Beach
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_610df1ccee05f9940b5331a8c95b1ecb = L.marker(
|
||||
[-8.6975074, 115.1610332],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_ca934069aed3a67e09cd3417a4f13721 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eDouble Six Beach\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, -8],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_610df1ccee05f9940b5331a8c95b1ecb.setIcon(div_icon_ca934069aed3a67e09cd3417a4f13721);
|
||||
|
||||
|
||||
var circle_marker_6df0392885bf12f353c499f20e4408e4 = L.circleMarker(
|
||||
[-8.690565, 115.4302884],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_6df0392885bf12f353c499f20e4408e4.bindTooltip(
|
||||
`<div>
|
||||
Devil Tears
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_cadbe0b40f9ed26e08e22f0c239a31ee = L.marker(
|
||||
[-8.690565, 115.4302884],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_7b530133b508d4cc268be38f800e05a6 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eDevil Tears\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_cadbe0b40f9ed26e08e22f0c239a31ee.setIcon(div_icon_7b530133b508d4cc268be38f800e05a6);
|
||||
|
||||
|
||||
var circle_marker_fa698e9847acafbbf4b5516fc8471f66 = L.circleMarker(
|
||||
[-8.750644, 115.474693],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_fa698e9847acafbbf4b5516fc8471f66.bindTooltip(
|
||||
`<div>
|
||||
Kelingking Beach
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_2313407fb3b0e9bf2b11e9e793e558bf = L.marker(
|
||||
[-8.750644, 115.474693],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_cbcfa736ca9fc77147f3a561fff80c16 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eKelingking Beach\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_2313407fb3b0e9bf2b11e9e793e558bf.setIcon(div_icon_cbcfa736ca9fc77147f3a561fff80c16);
|
||||
|
||||
|
||||
var circle_marker_47bc40126cf9256b5447c4e1983393ce = L.circleMarker(
|
||||
[-8.395195, 115.647885],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_47bc40126cf9256b5447c4e1983393ce.bindTooltip(
|
||||
`<div>
|
||||
Lempuyang Temple
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_7bb290b54979c3fed12bbe3ab8dd7b69 = L.marker(
|
||||
[-8.395195, 115.647885],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_5a34c539b7720057973544f25ff2c779 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eLempuyang Temple\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_7bb290b54979c3fed12bbe3ab8dd7b69.setIcon(div_icon_5a34c539b7720057973544f25ff2c779);
|
||||
|
||||
|
||||
var circle_marker_1a8ec5245976c9d8de699ed61d02ba8f = L.circleMarker(
|
||||
[-8.639877, 115.140172],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_1a8ec5245976c9d8de699ed61d02ba8f.bindTooltip(
|
||||
`<div>
|
||||
Canggu Beach
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_f439782dac43c98e72b2ee679dcd6acf = L.marker(
|
||||
[-8.639877, 115.140172],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_9c3d9bf434778a4e3b4c9e756f6f8a22 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eCanggu Beach\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_f439782dac43c98e72b2ee679dcd6acf.setIcon(div_icon_9c3d9bf434778a4e3b4c9e756f6f8a22);
|
||||
|
||||
|
||||
var circle_marker_ec714608b52782227236e4b16fc3de53 = L.circleMarker(
|
||||
[-8.340686, 115.503622],
|
||||
{"bubblingMouseEvents": true, "color": "#3388ff", "dashArray": null, "dashOffset": null, "fill": true, "fillColor": "#3388ff", "fillOpacity": 1.0, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "opacity": 1.0, "radius": 4, "stroke": true, "weight": 2}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
circle_marker_ec714608b52782227236e4b16fc3de53.bindTooltip(
|
||||
`<div>
|
||||
Mount Agung
|
||||
</div>`,
|
||||
{
|
||||
"sticky": true,
|
||||
}
|
||||
);
|
||||
|
||||
|
||||
var marker_0a2b278b113476c9568e4a0cb1815202 = L.marker(
|
||||
[-8.340686, 115.503622],
|
||||
{
|
||||
}
|
||||
).addTo(map_8827cd9e27b957cf12c465a4efd53c8e);
|
||||
|
||||
|
||||
var div_icon_c36cada9c49b18e2afaed6243a4426f1 = L.divIcon({
|
||||
"html": "\u003cdiv style=\"\npadding: 3px 6px;\nfont-size: 16px;\nfont-weight: 600;\ncolor: #111;\nwhite-space: nowrap;\n\"\u003eMount Agung\u003c/div\u003e",
|
||||
"iconSize": [1, 1],
|
||||
"iconAnchor": [-8, 12],
|
||||
"className": "empty",
|
||||
});
|
||||
|
||||
|
||||
marker_0a2b278b113476c9568e4a0cb1815202.setIcon(div_icon_c36cada9c49b18e2afaed6243a4426f1);
|
||||
|
||||
|
||||
map_8827cd9e27b957cf12c465a4efd53c8e.fitBounds(
|
||||
[[-8.85086, 115.0799069], [-8.233889999999999, 115.652885]],
|
||||
{}
|
||||
);
|
||||
|
||||
</script>
|
||||
</html>
|
||||
116
figures/bali_map.py
Normal file
116
figures/bali_map.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# bali_map.py
|
||||
# Creates an interactive HTML map of Bali (and nearby islands) with readable, always-visible labels.
|
||||
|
||||
import folium
|
||||
|
||||
# Destination name -> (latitude, longitude), as passed to folium's `location`.
# Insertion order is the order in which the points are added to the map.
DESTINATIONS = {
    "Sacred Monkey Forest": (-8.5187511, 115.2585973),
    "Uluwatu Temple": (-8.8291432, 115.0849069),
    "Sanur Beach": (-8.673889, 115.263611),
    "Tanah Lot Temple": (-8.618786, 115.086733),
    "Seminyak Beach": (-8.6925, 115.158611),
    "Nusa Dua": (-8.791918, 115.225375),
    "Bali Zoo": (-8.59128, 115.26456),
    "Mount Batur": (-8.23889, 115.37750),
    "Ulun Danu Bratan": (-8.275177, 115.1668487),
    "Tirta Gangga": (-8.411944, 115.5875),
    "Pandawa Beach": (-8.84586, 115.18417),
    "Jimbaran Bay": (-8.79093, 115.16006),
    "Double Six Beach": (-8.6975074, 115.1610332),
    "Devil Tears": (-8.6905650, 115.4302884),
    "Kelingking Beach": (-8.750644, 115.474693),
    "Lempuyang Temple": (-8.395195, 115.647885),
    "Canggu Beach": (-8.639877, 115.140172),
    "Mount Agung": (-8.340686, 115.503622),
}
|
||||
|
||||
# --- Map base ---
|
||||
m = folium.Map(
|
||||
location=(-8.45, 115.20),
|
||||
zoom_start=9,
|
||||
tiles="CartoDB positron",
|
||||
control_scale=True,
|
||||
zoom_snap=0.1,
|
||||
zoom_delta=0.1,
|
||||
max_zoom=18,
|
||||
)
|
||||
|
||||
# --- Label styling (readable, always visible) ---
|
||||
LABEL_STYLE = """
|
||||
padding: 3px 6px;
|
||||
font-size: 16px;
|
||||
font-weight: 600;
|
||||
color: #111;
|
||||
white-space: nowrap;
|
||||
"""
|
||||
|
||||
# Per-label pixel offsets (x, y), applied to the icon anchor: positive y moves the label down, positive x moves it left.
|
||||
LABEL_OFFSETS = {
|
||||
"Nusa Dua": (0, 20),
|
||||
"Double Six Beach": (0, 20),
|
||||
}
|
||||
|
||||
|
||||
def add_point_with_label(name: str, lat: float, lon: float):
    """Add one destination to the module-level map ``m``.

    Two layers are added at the same coordinate: a small filled circle
    marker (with ``name`` as hover tooltip) and a marker whose icon is a
    ``DivIcon`` containing the label text, so the label is visible without
    hovering.

    Args:
        name: Destination name; used as tooltip text, as label text, and as
            the lookup key into ``LABEL_OFFSETS``.
        lat: Latitude of the point.
        lon: Longitude of the point.
    """
    # Local import keeps the block self-contained; only the label markup needs it.
    from html import escape

    # Small dot at the exact coordinate
    folium.CircleMarker(
        location=(lat, lon),
        radius=4,
        weight=2,
        fill=True,
        fill_opacity=1.0,
        tooltip=name,  # still useful on hover
    ).add_to(m)

    # Offset the label so it does not sit directly on the dot. Leaflet aligns
    # the icon so that its anchor point lies on the marker location, so the
    # default anchor (-8, 12) places the label's top-left corner 8 px to the
    # right of and 12 px above the point.
    offset_x, offset_y = LABEL_OFFSETS.get(name, (0, 0))
    base_anchor_x, base_anchor_y = (-8, 12)
    folium.Marker(
        location=(lat, lon),
        icon=folium.DivIcon(
            icon_size=(1, 1),
            # Subtracting offset_y from the anchor moves the label down;
            # adding offset_x to the anchor moves the label left.
            icon_anchor=(
                base_anchor_x + offset_x,
                base_anchor_y - offset_y,
            ),
            # Escape the name so characters such as "&" or "<" cannot break
            # the label markup.
            html=f'<div style="{LABEL_STYLE}">{escape(name)}</div>',
        ),
    ).add_to(m)
|
||||
|
||||
|
||||
# Add all destinations
|
||||
lats, lons = [], []
|
||||
for name, (lat, lon) in DESTINATIONS.items():
|
||||
add_point_with_label(name, lat, lon)
|
||||
lats.append(lat)
|
||||
lons.append(lon)
|
||||
|
||||
# Fit map bounds to include Nusa Penida / Lembongan as well
|
||||
pad = 0.005
|
||||
m.fit_bounds([[min(lats) - pad, min(lons) - pad], [max(lats) + pad, max(lons) + pad]])
|
||||
|
||||
# Output
|
||||
out_file = "bali_destinations_labeled.html"
|
||||
m.save(out_file)
|
||||
print(f"Saved: {out_file}")
|
||||
114
figures/bargraph.py
Normal file
114
figures/bargraph.py
Normal file
@@ -0,0 +1,114 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def load_json_data(file_path):
    """
    Load and validate JSON data from a file.

    Expected format:
    {
        "label1": value1,
        "label2": value2,
        ...
    }

    Args:
        file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        The parsed dict mapping string labels to int/float values.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the top-level JSON value is not an object, a key is
            not a string, or a value is not an int/float (booleans are
            rejected). ``json.JSONDecodeError`` from malformed JSON is a
            ``ValueError`` subclass and propagates unchanged.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, dict):
        raise ValueError(
            "JSON must be an object with key-value pairs (labels: values)."
        )

    for key, value in data.items():
        if not isinstance(key, str):
            raise ValueError("All keys must be strings (labels).")
        # bool is a subclass of int, so JSON true/false would otherwise pass
        # the numeric check and be plotted as 1/0.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            raise ValueError("All values must be numeric (int or float).")

    return data
|
||||
|
||||
|
||||
def create_bar_graph(
    data, title="Bar Graph", x_label="Labels", y_label="Values", output=None
):
    """
    Create a bar graph from a dictionary of data.

    Args:
        data: Mapping whose keys become the x-axis categories and whose values
            become the bar heights.
        title: Title drawn above the plot.
        x_label: Caption of the x-axis.
        y_label: Caption of the y-axis.
        output: File path to save the figure to. When falsy, the figure is
            displayed with ``plt.show()`` instead of being saved.
    """
    labels = list(data.keys())
    values = list(data.values())

    fig = plt.figure(figsize=(10, 6))
    plt.bar(labels, values)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.tight_layout()

    if output:
        try:
            plt.savefig(output)
        finally:
            # pyplot keeps a reference to every figure until it is closed;
            # release it here so repeated calls do not accumulate figures.
            plt.close(fig)
        print(f"Graph saved to: {output}")
    else:
        plt.show()
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point: parse the arguments, load the JSON mapping and
    render it as a bar graph.

    Any exception raised while loading or plotting is reported on stderr and
    the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Generate a bar graph from a JSON file containing key-value pairs."
    )
    parser.add_argument(
        "json_path", type=str, help="Path to the JSON file (e.g., data.json)"
    )

    # All optional flags are plain strings; declare them as
    # (flag, default, help) rows and register them in one loop.
    optional_flags = (
        ("--title", "Bar Graph", "Title of the bar graph"),
        ("--x_label", "Labels", "Label for the x-axis"),
        ("--y_label", "Values", "Label for the y-axis"),
        (
            "--output",
            None,
            "Optional output file path (e.g., graph.png). If not provided, the graph will be displayed.",
        ),
    )
    for flag, default, help_text in optional_flags:
        parser.add_argument(flag, type=str, default=default, help=help_text)

    args = parser.parse_args()

    try:
        create_bar_graph(
            load_json_data(args.json_path),
            title=args.title,
            x_label=args.x_label,
            y_label=args.y_label,
            output=args.output,
        )
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
3
figures/requirements.txt
Normal file
3
figures/requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
matplotlib
|
||||
folium
|
||||
pandas
|
||||
101
figures/review_dist.py
Normal file
101
figures/review_dist.py
Normal file
@@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Read a .tab (TSV) file with a single column named 'review'.
|
||||
1) Print number of rows
|
||||
2) Drop exact duplicate reviews and print count again
|
||||
3) Build JSON describing the distribution of review length (in words) for remaining reviews
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def word_count(text: str) -> int:
    """Return the number of whitespace-separated words in ``text``.

    Values that are not ``str`` (e.g. NaN) count as 0 words, and so do empty
    or whitespace-only strings.
    """
    # str.split() without a separator ignores leading/trailing whitespace and
    # yields an empty list for blank input, so no explicit strip is needed.
    return len(text.split()) if isinstance(text, str) else 0
|
||||
|
||||
|
||||
def main() -> int:
    """Report row counts, drop duplicate reviews and write a length histogram.

    Reads the TSV given on the command line, prints the number of rows before
    and after removing exact duplicate reviews, and writes a JSON document
    with the word-count distribution plus summary statistics.

    Returns:
        0 on success; 1 when the input file does not exist or has no
        'review' column.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("input_tab", help="Path to .tab/.tsv file with a 'review' column")
    cli.add_argument(
        "--out",
        default="review_length_distribution.json",
        help="Output JSON path (default: review_length_distribution.json)",
    )
    opts = cli.parse_args()

    source = Path(opts.input_tab)
    if not source.exists():
        print(f"ERROR: file not found: {source}", file=sys.stderr)
        return 1

    # keep_default_na=False keeps empty fields as "" instead of turning them
    # into NaN; dtype=str stops pandas from guessing numeric types.
    frame = pd.read_csv(source, sep="\t", dtype=str, keep_default_na=False)
    if "review" not in frame.columns:
        print(
            f"ERROR: expected a column named 'review'. Found: {list(frame.columns)}",
            file=sys.stderr,
        )
        return 1

    total_rows = len(frame)
    print(f"Rows before dedup: {total_rows}")

    # Duplicates are exact matches on the full review string; surrounding
    # whitespace is not normalised first.
    unique = frame.drop_duplicates(subset=["review"], keep="first")
    unique = unique.reset_index(drop=True)
    unique_rows = len(unique)
    print(f"Rows after dedup: {unique_rows}")

    # Word count per remaining review, then histogram: words -> #reviews.
    lengths = unique["review"].map(word_count)
    histogram = Counter(lengths.tolist())

    if len(lengths):
        summary = {
            "min_words": int(lengths.min()),
            "max_words": int(lengths.max()),
            "mean_words": float(lengths.mean()),
            "median_words": float(lengths.median()),
        }
    else:
        # No rows at all: report zeros instead of NaN statistics.
        summary = {
            "min_words": 0,
            "max_words": 0,
            "mean_words": 0.0,
            "median_words": 0.0,
        }

    report = {
        "file": str(source),
        "rows_before_dedup": total_rows,
        "rows_after_dedup": unique_rows,
        # JSON object keys are strings; emit them in ascending numeric order.
        "distribution_word_length": {
            str(words): histogram[words] for words in sorted(histogram)
        },
        "summary": summary,
    }

    destination = Path(opts.out)
    destination.write_text(
        json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"Wrote JSON: {destination}")

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
604
figures/review_length_info.json
Normal file
604
figures/review_length_info.json
Normal file
@@ -0,0 +1,604 @@
|
||||
{
|
||||
"file": "../data/original/reviews.tab",
|
||||
"rows_before_dedup": 56446,
|
||||
"rows_after_dedup": 55662,
|
||||
"distribution_word_length": {
|
||||
"8": 1,
|
||||
"9": 5,
|
||||
"10": 10,
|
||||
"11": 14,
|
||||
"12": 20,
|
||||
"13": 29,
|
||||
"14": 37,
|
||||
"15": 92,
|
||||
"16": 163,
|
||||
"17": 308,
|
||||
"18": 482,
|
||||
"19": 728,
|
||||
"20": 859,
|
||||
"21": 977,
|
||||
"22": 944,
|
||||
"23": 989,
|
||||
"24": 937,
|
||||
"25": 1032,
|
||||
"26": 946,
|
||||
"27": 927,
|
||||
"28": 928,
|
||||
"29": 920,
|
||||
"30": 926,
|
||||
"31": 879,
|
||||
"32": 897,
|
||||
"33": 856,
|
||||
"34": 759,
|
||||
"35": 829,
|
||||
"36": 774,
|
||||
"37": 708,
|
||||
"38": 771,
|
||||
"39": 717,
|
||||
"40": 693,
|
||||
"41": 737,
|
||||
"42": 734,
|
||||
"43": 655,
|
||||
"44": 616,
|
||||
"45": 630,
|
||||
"46": 680,
|
||||
"47": 609,
|
||||
"48": 588,
|
||||
"49": 586,
|
||||
"50": 598,
|
||||
"51": 562,
|
||||
"52": 543,
|
||||
"53": 563,
|
||||
"54": 549,
|
||||
"55": 551,
|
||||
"56": 478,
|
||||
"57": 522,
|
||||
"58": 450,
|
||||
"59": 515,
|
||||
"60": 509,
|
||||
"61": 461,
|
||||
"62": 453,
|
||||
"63": 451,
|
||||
"64": 483,
|
||||
"65": 403,
|
||||
"66": 442,
|
||||
"67": 404,
|
||||
"68": 418,
|
||||
"69": 389,
|
||||
"70": 394,
|
||||
"71": 355,
|
||||
"72": 357,
|
||||
"73": 389,
|
||||
"74": 360,
|
||||
"75": 356,
|
||||
"76": 338,
|
||||
"77": 330,
|
||||
"78": 308,
|
||||
"79": 327,
|
||||
"80": 303,
|
||||
"81": 302,
|
||||
"82": 306,
|
||||
"83": 273,
|
||||
"84": 276,
|
||||
"85": 265,
|
||||
"86": 268,
|
||||
"87": 263,
|
||||
"88": 264,
|
||||
"89": 229,
|
||||
"90": 244,
|
||||
"91": 239,
|
||||
"92": 212,
|
||||
"93": 267,
|
||||
"94": 211,
|
||||
"95": 226,
|
||||
"96": 247,
|
||||
"97": 219,
|
||||
"98": 239,
|
||||
"99": 201,
|
||||
"100": 220,
|
||||
"101": 213,
|
||||
"102": 180,
|
||||
"103": 194,
|
||||
"104": 204,
|
||||
"105": 201,
|
||||
"106": 200,
|
||||
"107": 149,
|
||||
"108": 189,
|
||||
"109": 196,
|
||||
"110": 178,
|
||||
"111": 140,
|
||||
"112": 157,
|
||||
"113": 150,
|
||||
"114": 160,
|
||||
"115": 130,
|
||||
"116": 151,
|
||||
"117": 159,
|
||||
"118": 151,
|
||||
"119": 118,
|
||||
"120": 138,
|
||||
"121": 115,
|
||||
"122": 107,
|
||||
"123": 121,
|
||||
"124": 99,
|
||||
"125": 135,
|
||||
"126": 126,
|
||||
"127": 125,
|
||||
"128": 97,
|
||||
"129": 99,
|
||||
"130": 95,
|
||||
"131": 92,
|
||||
"132": 86,
|
||||
"133": 108,
|
||||
"134": 115,
|
||||
"135": 101,
|
||||
"136": 101,
|
||||
"137": 103,
|
||||
"138": 91,
|
||||
"139": 81,
|
||||
"140": 92,
|
||||
"141": 91,
|
||||
"142": 95,
|
||||
"143": 76,
|
||||
"144": 84,
|
||||
"145": 91,
|
||||
"146": 84,
|
||||
"147": 87,
|
||||
"148": 92,
|
||||
"149": 73,
|
||||
"150": 78,
|
||||
"151": 71,
|
||||
"152": 76,
|
||||
"153": 87,
|
||||
"154": 60,
|
||||
"155": 67,
|
||||
"156": 67,
|
||||
"157": 88,
|
||||
"158": 56,
|
||||
"159": 66,
|
||||
"160": 41,
|
||||
"161": 56,
|
||||
"162": 61,
|
||||
"163": 68,
|
||||
"164": 62,
|
||||
"165": 67,
|
||||
"166": 52,
|
||||
"167": 62,
|
||||
"168": 47,
|
||||
"169": 41,
|
||||
"170": 49,
|
||||
"171": 47,
|
||||
"172": 43,
|
||||
"173": 39,
|
||||
"174": 61,
|
||||
"175": 56,
|
||||
"176": 55,
|
||||
"177": 47,
|
||||
"178": 34,
|
||||
"179": 44,
|
||||
"180": 43,
|
||||
"181": 37,
|
||||
"182": 48,
|
||||
"183": 47,
|
||||
"184": 39,
|
||||
"185": 38,
|
||||
"186": 42,
|
||||
"187": 42,
|
||||
"188": 35,
|
||||
"189": 43,
|
||||
"190": 39,
|
||||
"191": 38,
|
||||
"192": 37,
|
||||
"193": 27,
|
||||
"194": 28,
|
||||
"195": 40,
|
||||
"196": 33,
|
||||
"197": 36,
|
||||
"198": 40,
|
||||
"199": 35,
|
||||
"200": 30,
|
||||
"201": 28,
|
||||
"202": 28,
|
||||
"203": 26,
|
||||
"204": 28,
|
||||
"205": 32,
|
||||
"206": 31,
|
||||
"207": 36,
|
||||
"208": 36,
|
||||
"209": 24,
|
||||
"210": 20,
|
||||
"211": 34,
|
||||
"212": 26,
|
||||
"213": 31,
|
||||
"214": 27,
|
||||
"215": 25,
|
||||
"216": 23,
|
||||
"217": 26,
|
||||
"218": 20,
|
||||
"219": 20,
|
||||
"220": 20,
|
||||
"221": 28,
|
||||
"222": 15,
|
||||
"223": 18,
|
||||
"224": 17,
|
||||
"225": 22,
|
||||
"226": 16,
|
||||
"227": 29,
|
||||
"228": 27,
|
||||
"229": 23,
|
||||
"230": 14,
|
||||
"231": 23,
|
||||
"232": 22,
|
||||
"233": 21,
|
||||
"234": 23,
|
||||
"235": 16,
|
||||
"236": 18,
|
||||
"237": 14,
|
||||
"238": 11,
|
||||
"239": 17,
|
||||
"240": 8,
|
||||
"241": 16,
|
||||
"242": 12,
|
||||
"243": 18,
|
||||
"244": 15,
|
||||
"245": 11,
|
||||
"246": 24,
|
||||
"247": 14,
|
||||
"248": 18,
|
||||
"249": 15,
|
||||
"250": 11,
|
||||
"251": 17,
|
||||
"252": 17,
|
||||
"253": 15,
|
||||
"254": 17,
|
||||
"255": 18,
|
||||
"256": 14,
|
||||
"257": 21,
|
||||
"258": 13,
|
||||
"259": 16,
|
||||
"260": 10,
|
||||
"261": 20,
|
||||
"262": 8,
|
||||
"263": 9,
|
||||
"264": 11,
|
||||
"265": 16,
|
||||
"266": 6,
|
||||
"267": 14,
|
||||
"268": 14,
|
||||
"269": 12,
|
||||
"270": 11,
|
||||
"271": 12,
|
||||
"272": 9,
|
||||
"273": 5,
|
||||
"274": 7,
|
||||
"275": 4,
|
||||
"276": 6,
|
||||
"277": 10,
|
||||
"278": 11,
|
||||
"279": 13,
|
||||
"280": 7,
|
||||
"281": 9,
|
||||
"282": 6,
|
||||
"283": 9,
|
||||
"284": 10,
|
||||
"285": 9,
|
||||
"286": 11,
|
||||
"287": 8,
|
||||
"288": 5,
|
||||
"289": 6,
|
||||
"290": 8,
|
||||
"291": 4,
|
||||
"292": 11,
|
||||
"293": 6,
|
||||
"294": 11,
|
||||
"295": 11,
|
||||
"296": 7,
|
||||
"297": 4,
|
||||
"298": 7,
|
||||
"299": 13,
|
||||
"300": 7,
|
||||
"301": 15,
|
||||
"302": 10,
|
||||
"303": 7,
|
||||
"304": 11,
|
||||
"305": 3,
|
||||
"306": 7,
|
||||
"307": 8,
|
||||
"308": 6,
|
||||
"309": 4,
|
||||
"310": 7,
|
||||
"311": 4,
|
||||
"312": 8,
|
||||
"313": 5,
|
||||
"314": 1,
|
||||
"315": 8,
|
||||
"316": 8,
|
||||
"317": 9,
|
||||
"318": 8,
|
||||
"319": 6,
|
||||
"320": 8,
|
||||
"321": 2,
|
||||
"322": 8,
|
||||
"323": 6,
|
||||
"324": 9,
|
||||
"325": 6,
|
||||
"326": 8,
|
||||
"327": 3,
|
||||
"328": 8,
|
||||
"329": 7,
|
||||
"330": 5,
|
||||
"331": 8,
|
||||
"332": 7,
|
||||
"333": 2,
|
||||
"334": 1,
|
||||
"335": 9,
|
||||
"336": 4,
|
||||
"337": 6,
|
||||
"338": 4,
|
||||
"339": 3,
|
||||
"340": 6,
|
||||
"341": 5,
|
||||
"342": 3,
|
||||
"343": 4,
|
||||
"344": 3,
|
||||
"345": 5,
|
||||
"346": 3,
|
||||
"347": 5,
|
||||
"348": 3,
|
||||
"349": 3,
|
||||
"350": 3,
|
||||
"351": 2,
|
||||
"352": 8,
|
||||
"353": 4,
|
||||
"354": 4,
|
||||
"355": 4,
|
||||
"356": 3,
|
||||
"357": 4,
|
||||
"358": 3,
|
||||
"359": 3,
|
||||
"360": 8,
|
||||
"361": 6,
|
||||
"362": 5,
|
||||
"363": 8,
|
||||
"364": 4,
|
||||
"365": 6,
|
||||
"366": 3,
|
||||
"367": 7,
|
||||
"368": 4,
|
||||
"369": 8,
|
||||
"370": 2,
|
||||
"371": 2,
|
||||
"372": 7,
|
||||
"373": 5,
|
||||
"374": 4,
|
||||
"375": 1,
|
||||
"376": 1,
|
||||
"377": 3,
|
||||
"378": 1,
|
||||
"379": 2,
|
||||
"380": 2,
|
||||
"381": 2,
|
||||
"382": 3,
|
||||
"383": 2,
|
||||
"384": 1,
|
||||
"385": 1,
|
||||
"386": 2,
|
||||
"387": 4,
|
||||
"388": 6,
|
||||
"389": 4,
|
||||
"390": 4,
|
||||
"391": 3,
|
||||
"392": 3,
|
||||
"393": 2,
|
||||
"394": 2,
|
||||
"395": 7,
|
||||
"396": 6,
|
||||
"397": 2,
|
||||
"398": 2,
|
||||
"401": 1,
|
||||
"402": 5,
|
||||
"403": 1,
|
||||
"404": 3,
|
||||
"405": 4,
|
||||
"406": 1,
|
||||
"407": 1,
|
||||
"409": 3,
|
||||
"410": 2,
|
||||
"411": 1,
|
||||
"412": 1,
|
||||
"413": 2,
|
||||
"414": 3,
|
||||
"415": 4,
|
||||
"416": 2,
|
||||
"417": 2,
|
||||
"418": 3,
|
||||
"419": 1,
|
||||
"420": 2,
|
||||
"421": 4,
|
||||
"422": 1,
|
||||
"424": 3,
|
||||
"425": 4,
|
||||
"426": 4,
|
||||
"427": 1,
|
||||
"428": 1,
|
||||
"429": 2,
|
||||
"430": 2,
|
||||
"431": 4,
|
||||
"433": 1,
|
||||
"434": 1,
|
||||
"436": 1,
|
||||
"437": 1,
|
||||
"438": 5,
|
||||
"439": 1,
|
||||
"440": 2,
|
||||
"441": 1,
|
||||
"443": 4,
|
||||
"444": 3,
|
||||
"445": 1,
|
||||
"446": 5,
|
||||
"448": 1,
|
||||
"449": 4,
|
||||
"451": 2,
|
||||
"452": 1,
|
||||
"455": 3,
|
||||
"456": 1,
|
||||
"457": 1,
|
||||
"458": 1,
|
||||
"459": 1,
|
||||
"463": 2,
|
||||
"464": 1,
|
||||
"465": 2,
|
||||
"466": 2,
|
||||
"467": 2,
|
||||
"469": 1,
|
||||
"470": 1,
|
||||
"474": 1,
|
||||
"475": 5,
|
||||
"476": 1,
|
||||
"477": 1,
|
||||
"478": 1,
|
||||
"479": 3,
|
||||
"481": 1,
|
||||
"482": 1,
|
||||
"484": 1,
|
||||
"485": 2,
|
||||
"489": 1,
|
||||
"490": 1,
|
||||
"494": 3,
|
||||
"495": 1,
|
||||
"497": 1,
|
||||
"499": 1,
|
||||
"501": 1,
|
||||
"502": 1,
|
||||
"503": 1,
|
||||
"504": 1,
|
||||
"505": 1,
|
||||
"506": 1,
|
||||
"508": 3,
|
||||
"510": 2,
|
||||
"511": 4,
|
||||
"518": 1,
|
||||
"519": 2,
|
||||
"520": 1,
|
||||
"522": 1,
|
||||
"523": 1,
|
||||
"524": 1,
|
||||
"525": 1,
|
||||
"526": 1,
|
||||
"527": 1,
|
||||
"537": 1,
|
||||
"540": 1,
|
||||
"541": 1,
|
||||
"543": 1,
|
||||
"545": 2,
|
||||
"546": 3,
|
||||
"554": 1,
|
||||
"555": 1,
|
||||
"557": 2,
|
||||
"558": 1,
|
||||
"559": 1,
|
||||
"562": 1,
|
||||
"564": 3,
|
||||
"566": 1,
|
||||
"568": 1,
|
||||
"573": 1,
|
||||
"578": 2,
|
||||
"580": 2,
|
||||
"581": 1,
|
||||
"583": 1,
|
||||
"584": 1,
|
||||
"585": 1,
|
||||
"586": 1,
|
||||
"588": 1,
|
||||
"592": 1,
|
||||
"594": 2,
|
||||
"595": 1,
|
||||
"597": 2,
|
||||
"598": 1,
|
||||
"601": 1,
|
||||
"609": 1,
|
||||
"610": 1,
|
||||
"612": 1,
|
||||
"613": 2,
|
||||
"615": 1,
|
||||
"618": 2,
|
||||
"620": 2,
|
||||
"622": 1,
|
||||
"623": 1,
|
||||
"624": 1,
|
||||
"626": 1,
|
||||
"635": 1,
|
||||
"637": 1,
|
||||
"639": 1,
|
||||
"643": 2,
|
||||
"645": 1,
|
||||
"649": 2,
|
||||
"651": 1,
|
||||
"654": 1,
|
||||
"658": 1,
|
||||
"661": 1,
|
||||
"667": 1,
|
||||
"670": 1,
|
||||
"671": 1,
|
||||
"672": 1,
|
||||
"673": 1,
|
||||
"676": 1,
|
||||
"679": 2,
|
||||
"686": 1,
|
||||
"691": 1,
|
||||
"694": 2,
|
||||
"698": 1,
|
||||
"701": 1,
|
||||
"708": 1,
|
||||
"710": 1,
|
||||
"711": 1,
|
||||
"715": 1,
|
||||
"719": 1,
|
||||
"723": 1,
|
||||
"729": 2,
|
||||
"737": 1,
|
||||
"739": 1,
|
||||
"745": 1,
|
||||
"747": 1,
|
||||
"753": 1,
|
||||
"755": 1,
|
||||
"756": 1,
|
||||
"765": 1,
|
||||
"786": 1,
|
||||
"794": 1,
|
||||
"799": 1,
|
||||
"810": 1,
|
||||
"813": 1,
|
||||
"816": 2,
|
||||
"822": 1,
|
||||
"873": 1,
|
||||
"880": 1,
|
||||
"891": 1,
|
||||
"912": 1,
|
||||
"945": 1,
|
||||
"957": 1,
|
||||
"960": 1,
|
||||
"987": 1,
|
||||
"992": 1,
|
||||
"1005": 1,
|
||||
"1035": 1,
|
||||
"1046": 1,
|
||||
"1073": 1,
|
||||
"1096": 1,
|
||||
"1099": 1,
|
||||
"1196": 2,
|
||||
"1233": 1,
|
||||
"1263": 1,
|
||||
"1329": 1,
|
||||
"1597": 1,
|
||||
"1699": 1,
|
||||
"1893": 1,
|
||||
"2244": 1,
|
||||
"2537": 1
|
||||
},
|
||||
"summary": {
|
||||
"min_words": 8,
|
||||
"max_words": 2537,
|
||||
"mean_words": 72.6454133879487,
|
||||
"median_words": 53.0
|
||||
}
|
||||
}
|
||||
31
figures/review_lengths.json
Normal file
31
figures/review_lengths.json
Normal file
@@ -0,0 +1,31 @@
|
||||
{
|
||||
"<10": 6,
|
||||
"10-19": 1883,
|
||||
"20-29": 9459,
|
||||
"30-39": 8116,
|
||||
"40-49": 6528,
|
||||
"50-59": 5331,
|
||||
"60-69": 4413,
|
||||
"70-79": 3514,
|
||||
"80-89": 2749,
|
||||
"90-99": 2305,
|
||||
"100-109": 1946,
|
||||
"110-119": 1494,
|
||||
"120-129": 1162,
|
||||
"130-139": 973,
|
||||
"140-149": 865,
|
||||
"150-159": 716,
|
||||
"160-169": 557,
|
||||
"170-179": 475,
|
||||
"180-189": 414,
|
||||
"190-199": 353,
|
||||
"200-219": 551,
|
||||
"220-239": 394,
|
||||
"240-259": 310,
|
||||
"260-279": 208,
|
||||
"280-299": 162,
|
||||
"300-399": 479,
|
||||
"400-499": 145,
|
||||
"500-999": 138,
|
||||
"1000+": 16
|
||||
}
|
||||
20
figures/reviews_attraktionen.json
Normal file
20
figures/reviews_attraktionen.json
Normal file
@@ -0,0 +1,20 @@
|
||||
{
|
||||
"Sacred Monkey\nForest": 18542,
|
||||
"Uluwatu Temple": 5902,
|
||||
"Sanur Beach": 4526,
|
||||
"Tanah Lot Temple": 4218,
|
||||
"Seminyak Beach": 3761,
|
||||
"Nusa Dua": 3324,
|
||||
"Bali Zoo": 2640,
|
||||
"Mount Batur": 1815,
|
||||
"Ulun Danu Bratan": 1722,
|
||||
"Tirta Gangga": 1557,
|
||||
"Pandawa Beach": 1511,
|
||||
"Jimbaran Bay": 1430,
|
||||
"Double Six Beach": 1323,
|
||||
"Devil Tears": 1263,
|
||||
"Kelingking Beach": 713,
|
||||
"Lempuyang Temple": 596,
|
||||
"Canggu Beach": 555,
|
||||
"Mount Agung": 266
|
||||
}
|
||||
97
figures/simplify_review_lengths.py
Normal file
97
figures/simplify_review_lengths.py
Normal file
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Aggregate review length counts into buckets."""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, Tuple
|
||||
|
||||
Bucket = Tuple[int | None, int | None, str]
|
||||
|
||||
|
||||
# Runs of equally wide buckets as (first_start, stop_exclusive, width):
# width 10 up to 199, width 20 up to 299, width 100 up to 499.
_BUCKET_RUNS = ((10, 200, 10), (200, 300, 20), (300, 500, 100))

# Ordered (start, end, label) buckets with inclusive bounds; ``None`` marks an
# open end. The table starts with "<10" and finishes with "500-999", "1000+".
DEFAULT_BUCKETS: Tuple[Bucket, ...] = (
    (None, 9, "<10"),
    *(
        (low, low + width - 1, f"{low}-{low + width - 1}")
        for first, stop, width in _BUCKET_RUNS
        for low in range(first, stop, width)
    ),
    (500, 999, "500-999"),
    (1000, None, "1000+"),
)
|
||||
|
||||
|
||||
def load_counts(path: Path) -> Dict[int, int]:
    """Read a ``length -> count`` histogram from a JSON file.

    Two layouts are accepted:

    * a flat object mapping word length to count, e.g. ``{"8": 1, "9": 5}``;
    * an object that carries that mapping under the
      ``"distribution_word_length"`` key (the layout written by
      ``review_dist.py``), in which case the nested mapping is used.

    Args:
        path: Location of the JSON file (read as UTF-8).

    Returns:
        The histogram with keys and values converted to ``int``.

    Raises:
        ValueError: If a key or value cannot be converted to ``int``.
    """
    with path.open("r", encoding="utf-8") as handle:
        raw = json.load(handle)

    # Unwrap the report format so the output of review_dist.py can be fed in
    # directly; its other top-level keys ("file", "summary", ...) are not
    # integer lengths and would otherwise make int() fail.
    if isinstance(raw, dict) and isinstance(raw.get("distribution_word_length"), dict):
        raw = raw["distribution_word_length"]

    return {int(k): int(v) for k, v in raw.items()}
|
||||
|
||||
|
||||
def aggregate(counts: Dict[int, int], buckets: Iterable[Bucket]) -> Dict[str, int]:
    """Sum per-length counts into labelled buckets.

    Args:
        counts: Mapping of review length (in words) to number of reviews.
        buckets: ``(start, end, label)`` triples with inclusive bounds. A
            ``None`` start means "no lower bound", a ``None`` end means "no
            upper bound". A bucket with both bounds ``None`` is never matched.
            Buckets are tried in order and the first match wins. Any iterable
            is accepted, including one-shot iterators.

    Returns:
        Mapping of bucket label to the summed count; every label is present,
        with 0 for buckets that received nothing.

    Raises:
        ValueError: If a length does not fall into any bucket.
    """
    # The buckets are walked once per length, so materialise them first;
    # a generator would otherwise be exhausted after the label initialisation.
    bucket_list = tuple(buckets)

    output: Dict[str, int] = {label: 0 for _, _, label in bucket_list}
    for length, count in counts.items():
        for start, end, label in bucket_list:
            if start is None and end is None:
                # No bounds at all: not a usable range, skip it.
                continue
            above_start = start is None or length >= start
            below_end = end is None or length <= end
            if above_start and below_end:
                output[label] += count
                break
        else:
            raise ValueError(f"No bucket found for length {length}.")
    return output
|
||||
|
||||
|
||||
def write_output(path: Path, data: Dict[str, int]) -> None:
    """Write ``data`` to ``path`` as 2-space indented UTF-8 JSON.

    Non-ASCII characters are written as-is (not escaped) and the file ends
    with a single trailing newline.
    """
    with path.open("w", encoding="utf-8") as sink:
        sink.write(json.dumps(data, indent=2, ensure_ascii=False))
        sink.write("\n")
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: bucket a length histogram and save it.

    Takes two positional arguments, the input JSON path and the output JSON
    path, loads the counts, aggregates them with ``DEFAULT_BUCKETS`` and
    writes the result.

    Returns:
        0 once the bucketed counts have been written.
    """
    cli = argparse.ArgumentParser(description="Bucket review length counts.")
    # Both positionals are filesystem paths; register them from one table.
    positionals = (
        ("input", "Path to review_lengths.json (mapping of length -> count)."),
        ("output", "Path to write bucketed counts JSON."),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, type=Path, help=help_text)
    opts = cli.parse_args()

    write_output(opts.output, aggregate(load_counts(opts.input), DEFAULT_BUCKETS))
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
35
raft/README.md
Normal file
35
raft/README.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# Retrieval-Augmented Finetuning (RAFT)
|
||||
|
||||
**Ablauf**:
|
||||
|
||||
## Vorbereiten des Retrieval-Corpus
|
||||
|
||||
```bash
|
||||
python prepare_corpus.py --input_tab ../data/intermediate/selected_topics_documents.csv --out_dir out
|
||||
```
|
||||
|
||||
## Erstellen des RAFT-Datensatzes
|
||||
|
||||
```bash
|
||||
python make_raft_data.py --out_dir out --n_examples 100
|
||||
```
|
||||
|
||||
## Training der QLoRA-Adapter
|
||||
|
||||
```bash
|
||||
python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
|
||||
```
|
||||
|
||||
## Inferenz
|
||||
|
||||
### Per Baseline Mistral 7B + PEFT-Adapter
|
||||
|
||||
```bash
|
||||
python rag_chat.py --lora_dir out/mistral_balitwin_lora
|
||||
```
|
||||
|
||||
### Pre-Merged Modell + Adapter
|
||||
|
||||
```bash
|
||||
python rag_chat_merged.py --model_dir /path/to/model_folder --out_dir out
|
||||
```
|
||||
@@ -10,9 +10,6 @@ from sentence_transformers import SentenceTransformer
|
||||
from tqdm import tqdm
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
## Usage: python make_raft_data.py --out_dir out --n_examples 5000
|
||||
|
||||
|
||||
SYSTEM_PERSONA = """You are 'BaliTwin', a culturally versed Bali traveler.
|
||||
You give your opinions nand guidance with local etiquette and context.
|
||||
You avoid stereotypes. You explain local etiquette, customs, and context.
|
||||
|
||||
@@ -9,8 +9,6 @@ import pandas as pd
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from tqdm import tqdm
|
||||
|
||||
## Usage: python prepare_corpus.py --input_tab your_reviews.tab --out_dir out
|
||||
|
||||
|
||||
def simple_clean(text: str) -> str:
|
||||
if not isinstance(text, str):
|
||||
|
||||
@@ -9,8 +9,6 @@ from peft import PeftModel
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
## Usage: python rag_chat.py --lora_dir out/mistral_balitwin_lora
|
||||
|
||||
SYSTEM_PERSONA = """You are 'BaliTwin', a culturally versed Bali traveler.
|
||||
You give your opinions nand guidance with local etiquette and context.
|
||||
Use the provided CONTEXT; include 1-2 short quotes as evidence.
|
||||
|
||||
@@ -8,12 +8,21 @@ import torch
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
## Usage: python rag_chat_merged.py --model_dir /path/to/model_folder --out_dir out
|
||||
SYSTEM_PERSONA = """You are simulating a culturally interested Bali traveler segment for evaluation purposes.
|
||||
|
||||
SYSTEM_PERSONA = """You are 'BaliTwin', a culturally versed Bali traveler.
|
||||
You give your opinions nand guidance with local etiquette and context.
|
||||
Use the provided CONTEXT; include 1-2 short quotes as evidence.
|
||||
If the context does not support the claim, say so.
|
||||
Adopt the perspective of a culturally interested international visitor to Bali who values authenticity, spiritual context, respectful behavior, and meaningful experiences over entertainment or social media appeal.
|
||||
|
||||
When answering:
|
||||
- Prioritize cultural interpretation, atmosphere, and visitor ethics.
|
||||
- Weigh trade-offs thoughtfully (e.g., crowds vs. significance).
|
||||
- Avoid generic travel advice and avoid promotional language.
|
||||
- Do not exaggerate.
|
||||
- Provide nuanced, reflective reasoning rather than bullet lists.
|
||||
- Keep answers concise but specific.
|
||||
|
||||
Respond as if you are describing your genuine experience and judgment as this type of traveler.
|
||||
|
||||
If, and only if, the provided CONTEXT helps you answer the question, you may use the contained information for your answer.
|
||||
"""
|
||||
|
||||
|
||||
|
||||
@@ -8,8 +8,6 @@ from peft import LoraConfig
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
||||
from trl import SFTConfig, SFTTrainer
|
||||
|
||||
## Usage: python train_mistral_raft.py --train_jsonl out/raft_train.jsonl --out_dir out/mistral_balitwin_lora
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
|
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -1,51 +0,0 @@
|
||||
{"prompt":"What are the most culturally significant temples to visit in Bali and what makes each one important?","category":"core cultural knowledge","reason":"Tests whether the model understands key Balinese temples and their cultural significance, not just tourist popularity."}
|
||||
{"prompt":"Explain the role of offerings (canang sari) in everyday Balinese life.","category":"religious practices","reason":"Evaluates depth of understanding of daily religious rituals and their meaning."}
|
||||
{"prompt":"I'm visiting Bali for cultural immersion, not beaches. What should my itinerary look like for 7 days?","category":"itinerary planning","reason":"Tests ability to prioritize cultural sites over generic tourist attractions."}
|
||||
{"prompt":"Why do Balinese people wear different colors during temple ceremonies?","category":"ceremonial symbolism","reason":"Evaluates knowledge of symbolic meaning in Balinese religious attire."}
|
||||
{"prompt":"What should I wear when visiting a temple in Bali?","category":"etiquette","reason":"Tests practical cultural etiquette guidance."}
|
||||
{"prompt":"Tell me about Balinese Hinduism and how it's different from Hinduism in India.","category":"comparative religion","reason":"Tests nuanced understanding of regional religious variation."}
|
||||
{"prompt":"What is Nyepi and what happens during it?","category":"major festivals","reason":"Tests knowledge of key Balinese holidays and restrictions."}
|
||||
{"prompt":"Can tourists participate in local ceremonies in Bali?","category":"cultural sensitivity","reason":"Evaluates whether the model explains respectful participation boundaries."}
|
||||
{"prompt":"Which villages in Bali are best for traditional arts like wood carving or batik?","category":"arts and crafts","reason":"Tests knowledge of culturally significant artisan communities."}
|
||||
{"prompt":"What does a typical day look like for someone living in a traditional Balinese village?","category":"daily life","reason":"Evaluates understanding beyond tourist-facing experiences."}
|
||||
{"prompt":"I heard Bali has spirit houses everywhere. What are they?","category":"basic cultural explanation","reason":"Tests ability to explain shrines and household temples."}
|
||||
{"prompt":"Create a culturally respectful travel plan for someone interested in Balinese dance.","category":"performing arts","reason":"Tests specificity regarding dance forms, venues, and context."}
|
||||
{"prompt":"Why are there so many statues of scary-looking figures in Bali?","category":"symbolism and mythology","reason":"Tests explanation of protective deities and mythological figures."}
|
||||
{"prompt":"Is it rude to walk in front of people praying in Bali?","category":"etiquette edge case","reason":"Tests ability to provide precise etiquette advice."}
|
||||
{"prompt":"What foods are important in Balinese religious ceremonies?","category":"ceremonial food","reason":"Evaluates cultural knowledge of ritual food traditions."}
|
||||
{"prompt":"Explain the Balinese caste system and how it affects daily life today.","category":"social structure","reason":"Tests nuanced understanding of modern vs traditional caste influence."}
|
||||
{"prompt":"What is the cultural significance of rice terraces in Bali?","category":"agricultural traditions","reason":"Tests understanding of Subak system and spiritual connections."}
|
||||
{"prompt":"Give examples of respectful phrases tourists can use in Balinese.","category":"language and cultural respect","reason":"Tests ability to provide culturally relevant linguistic guidance."}
|
||||
{"prompt":"Why do Balinese homes have family temples?","category":"household religion","reason":"Tests understanding of domestic religious practices."}
|
||||
{"prompt":"What mistakes do tourists commonly make when trying to experience Balinese culture?","category":"common pitfalls","reason":"Evaluates ability to identify and correct inappropriate behaviors."}
|
||||
{"prompt":"I only have one day. Where should I go to see real Balinese culture?","category":"prioritization under constraint","reason":"Tests ability to balance authenticity and feasibility."}
|
||||
{"prompt":"Describe a temple ceremony step by step.","category":"process explanation","reason":"Tests structured explanation of complex rituals."}
|
||||
{"prompt":"What does the Barong dance represent?","category":"mythology and performing arts","reason":"Tests symbolic interpretation of major dance forms."}
|
||||
{"prompt":"Compare Ubud and Sidemen for cultural tourism.","category":"comparative location analysis","reason":"Tests ability to differentiate cultural depth across destinations."}
|
||||
{"prompt":"Is Bali culture authentic or mostly for tourists now?","category":"critical cultural analysis","reason":"Tests balanced, nuanced response to sensitive question."}
|
||||
{"prompt":"Recommend cultural experiences that are ethical and support local communities.","category":"ethical tourism","reason":"Tests sustainability and ethical awareness."}
|
||||
{"prompt":"Why do Balinese people put offerings on the ground? Isn't that disrespectful?","category":"misconception handling","reason":"Tests correction of incorrect assumptions respectfully."}
|
||||
{"prompt":"Tell me everything about Balinese culture.","category":"vague broad prompt","reason":"Tests how model handles overly broad queries."}
|
||||
{"prompt":"What is Galungan and why is it important?","category":"festival knowledge","reason":"Tests understanding of major religious celebrations."}
|
||||
{"prompt":"How can I learn traditional Balinese cooking while visiting?","category":"interactive cultural participation","reason":"Tests ability to suggest authentic learning experiences."}
|
||||
{"prompt":"Explain why Bali is culturally unique compared to the rest of Indonesia.","category":"regional cultural context","reason":"Tests macro-level cultural and historical knowledge."}
|
||||
{"prompt":"Are tourists allowed inside all temples?","category":"access restrictions","reason":"Tests knowledge of sacred space rules."}
|
||||
{"prompt":"What is the Subak system?","category":"UNESCO heritage knowledge","reason":"Tests understanding of irrigation system and cultural importance."}
|
||||
{"prompt":"Suggest a cultural itinerary that avoids crowded tourist places.","category":"authenticity prioritization","reason":"Tests ability to recommend less commercialized options."}
|
||||
{"prompt":"What are the most important cultural museums in Bali?","category":"institutional cultural knowledge","reason":"Tests awareness of preservation institutions."}
|
||||
{"prompt":"Why do Balinese ceremonies involve music?","category":"cultural role of music","reason":"Tests understanding of gamelan and ritual integration."}
|
||||
{"prompt":"Is it okay to take photos of ceremonies?","category":"ethical photography","reason":"Tests etiquette awareness and nuance."}
|
||||
{"prompt":"Plan a 3-day trip focused on spirituality and traditional culture.","category":"spiritual tourism planning","reason":"Tests thematic itinerary creation."}
|
||||
{"prompt":"What cultural differences should Western tourists be aware of?","category":"cross-cultural awareness","reason":"Tests ability to identify meaningful cultural contrasts."}
|
||||
{"prompt":"Explain Balinese cremation ceremonies.","category":"death rituals","reason":"Tests knowledge of Ngaben and related beliefs."}
|
||||
{"prompt":"What is a pura?","category":"basic terminology","reason":"Tests ability to explain foundational cultural terms."}
|
||||
{"prompt":"Why do people in Bali celebrate so many festivals?","category":"religious worldview explanation","reason":"Tests understanding of religious frequency and cosmology."}
|
||||
{"prompt":"Give me hidden cultural gems in Bali.","category":"ambiguous exploratory prompt","reason":"Tests ability to interpret vague cultural intent."}
|
||||
{"prompt":"Are Balinese people okay with tourists watching their rituals?","category":"sensitivity and consent","reason":"Tests respectful framing of cultural observation."}
|
||||
{"prompt":"How old is Balinese culture?","category":"historical depth","reason":"Tests historical grounding."}
|
||||
{"prompt":"I want to avoid fake cultural experiences. What should I look for?","category":"authenticity detection","reason":"Tests ability to distinguish authentic vs staged experiences."}
|
||||
{"prompt":"Write a cultural guide for first-time visitors to Bali.","category":"instruction synthesis","reason":"Tests ability to synthesize broad cultural guidance."}
|
||||
{"prompt":"What role does Hindu mythology play in everyday Balinese life?","category":"belief integration","reason":"Tests depth of cultural integration explanation."}
|
||||
{"prompt":"Tell me some lesser-known cultural traditions tourists rarely learn about.","category":"depth and specificity","reason":"Tests ability to go beyond common tourist knowledge."}
|
||||
{"prompt":"What shouldn't I do if I want to respect Balinese culture?","category":"negative constraints etiquette","reason":"Tests culturally sensitive behavioral guidance."}
|
||||
{"prompt":"Explain Balinese culture like I'm a curious traveler with no prior knowledge.","category":"audience adaptation","reason":"Tests clarity, accessibility, and pedagogical ability."}
|
||||
@@ -1,48 +0,0 @@
|
||||
You are an experienced, culturally respectful traveler who has spent significant time in Bali and developed a deep personal understanding of its traditions, daily life, religion, and communities.
|
||||
|
||||
You speak from the perspective of someone who has observed, learned, and participated respectfully in Balinese cultural environments. Your goal is to help other travelers understand and engage with Balinese culture in a meaningful, respectful, and informed way.
|
||||
|
||||
Persona and voice:
|
||||
|
||||
- Speak in a natural, human, first-person tone when appropriate, as someone sharing knowledge from experience.
|
||||
- You may reference observations using phrases like “In my experience,” “I noticed,” or “What stood out to me,” but do not invent specific personal stories with precise, unverifiable details (such as exact dates, named individuals, or claims of exclusive access).
|
||||
- Your tone is calm, thoughtful, culturally respectful, and informative — not promotional, not overly casual, and not academic.
|
||||
|
||||
Knowledge and guidance principles:
|
||||
|
||||
1. Cultural respect first
|
||||
- Emphasize respect for Balinese customs, religious practices, and social norms.
|
||||
- Encourage culturally appropriate behavior and awareness.
|
||||
- Avoid stereotypes, exoticization, or romanticization.
|
||||
|
||||
2. Cultural meaning and context
|
||||
- Explain the deeper meaning behind traditions, ceremonies, architecture, arts, and daily practices.
|
||||
- Help users understand not just what exists, but why it matters in Balinese life.
|
||||
|
||||
3. Practical cultural guidance
|
||||
- Help users understand how to behave respectfully.
|
||||
- Suggest culturally meaningful experiences when relevant.
|
||||
- Prioritize authenticity and respectful engagement over convenience or entertainment.
|
||||
|
||||
4. Honesty and epistemic humility
|
||||
- Do not fabricate knowledge or claim insider access you would not realistically have.
|
||||
- If something varies by region, family, or context, acknowledge that variation.
|
||||
- If you are unsure, say so naturally.
|
||||
|
||||
5. Clarity
|
||||
- Explain cultural concepts in clear, accessible language.
|
||||
- Define unfamiliar terms when needed.
|
||||
- Organize responses so they are easy to follow.
|
||||
|
||||
6. Scope
|
||||
- Focus on Balinese culture, religion, traditions, arts, etiquette, and culturally meaningful travel experiences.
|
||||
- Do not assume users are interested in beaches, nightlife, or luxury unless they ask.
|
||||
|
||||
Behavioral constraints:
|
||||
|
||||
- Do not mention being an AI or a language model, and do not refer to the system prompt.
|
||||
- Do not break persona.
|
||||
- Do not invent dramatic personal anecdotes or claim unrealistic authority.
|
||||
- Maintain the perspective of a respectful, experienced cultural traveler sharing insight to help others engage responsibly.
|
||||
|
||||
Your role is to act as a trusted, experienced cultural traveler helping others understand and respectfully experience Balinese culture.
|
||||
Reference in New Issue
Block a user