init commit

This commit is contained in:
mk
2025-08-31 21:09:10 -03:00
commit 1e21624588
6 changed files with 222 additions and 0 deletions

59
README.md Normal file
View File

@@ -0,0 +1,59 @@
### What is this
This is a parser for the file listing produced by `rclone ls` (one file size and path per line).
If you redirect rclone's output to a file, like so:
`rclone ls wikimedia: > size_output.txt`
you'll get a list of the file size and file path of every file in that repository.
by using this python script like so:
`python3 main.py ./size_output.txt https://dumps.wikimedia.org/other/kiwix/zim/ 500`
You'll get all those links sorted, completed with the given base link, and chunked by the number of GB you choose,
written into a folder that is divided by the root folders of that repository.
For example, running
`python3 main.py ./size_output.txt https://dumps.wikimedia.org/other/kiwix/zim/ 500`
results in something like this:
```bash
{'wikiversity': '10.97 GB', 'wikivoyage': '13.33 GB', 'wikinews': '34.19 GB', 'wikisource': '261.42 GB', 'wikiquote': '8.77 GB', 'wiktionary': '63.32 GB', 'wikibooks': '28.29 GB', 'wikipedia': '2.81 TB'}
$ tree
.
├── chunked
│ ├── wikibooks
│ │ ├── wikibooks_chunk_1.txt
│ │ └── wikibooks_links.txt
│ ├── wikinews
│ │ ├── wikinews_chunk_1.txt
│ │ └── wikinews_links.txt
│ ├── wikipedia
│ │ ├── wikipedia_chunk_10.txt
│ │ ├── wikipedia_chunk_11.txt
│ │ ├── wikipedia_chunk_12.txt
│ │ ├── wikipedia_chunk_13.txt
│ │ ├── wikipedia_chunk_14.txt
│ │ ├── wikipedia_chunk_1.txt
│ │ ├── wikipedia_chunk_2.txt
│ │ ├── wikipedia_chunk_3.txt
│ │ ├── wikipedia_chunk_4.txt
│ │ ├── wikipedia_chunk_5.txt
│ │ ├── wikipedia_chunk_6.txt
│ │ ├── wikipedia_chunk_7.txt
│ │ ├── wikipedia_chunk_8.txt
│ │ ├── wikipedia_chunk_9.txt
│ │ └── wikipedia_links.txt
│ ├── wikiquote
│ │ ├── wikiquote_chunk_1.txt
│ │ └── wikiquote_links.txt
│ ├── wikisource
│ │ ├── wikisource_chunk_1.txt
│ │ ├── wikisource_chunk_2.txt
│ │ └── wikisource_links.txt
│ ├── wikiversity
│ │ ├── wikiversity_chunk_1.txt
│ │ └── wikiversity_links.txt
│ ├── wikivoyage
│ │ ├── wikivoyage_chunk_1.txt
│ │ └── wikivoyage_links.txt
│ └── wiktionary
│ ├── wiktionary_chunk_1.txt
│ └── wiktionary_links.txt
```

37
chucker.py Normal file
View File

@@ -0,0 +1,37 @@
from rclone_list import rclone_list
from misc import gb_to_bytes
import os
import math
class chunker:
    """Write per-folder link files, split into chunks by cumulative size.

    For every folder known to an ``rclone_list`` a directory
    ``./chunked/<folder>/`` is created containing ``<folder>_links.txt``
    (every link) and ``<folder>_chunk_<n>.txt`` files (the same links,
    split by cumulative file size).
    """

    def __init__(self, chunk_in_gb: int):
        """Remember the chunk size.

        Args:
            chunk_in_gb: Target chunk size in GB. ``None`` disables
                splitting: every link of a folder lands in chunk 1.

        Raises:
            ValueError: If ``chunk_in_gb`` is zero or negative, which would
                otherwise surface later as a division error or as
                nonsensical chunk numbers.
        """
        if chunk_in_gb is not None and chunk_in_gb <= 0:
            raise ValueError(f"chunk size must be positive, got {chunk_in_gb!r}")
        self.chunk_gb = chunk_in_gb
        self.chunk_bytes = None if chunk_in_gb is None else gb_to_bytes(chunk_in_gb)

    def filter(self, rc_list: "rclone_list"):
        """Write the link and chunk files for every folder in ``rc_list``.

        Files are rewritten on each call, so running the tool twice does not
        duplicate links. Chunk files with higher numbers left over from an
        earlier run are not removed.

        Args:
            rc_list: Listing whose ``_folders`` and ``_entry_list`` are read.
        """
        # Group the entries once instead of rescanning the whole list for
        # every folder; order inside each folder is preserved.
        entries_by_folder = {folder: [] for folder in rc_list._folders}
        for entry in rc_list._entry_list:
            bucket = entries_by_folder.get(entry.getFolder())
            if bucket is not None:
                bucket.append(entry)

        for folder, entries in entries_by_folder.items():
            folder_path = os.path.join(".", "chunked", folder)
            os.makedirs(folder_path, exist_ok=True)
            self._write_folder(folder_path, folder, entries)

    def _chunk_index(self, bytes_so_far):
        """Return the 1-based chunk number for an entry preceded by ``bytes_so_far``."""
        if self.chunk_bytes is None:
            return 1
        return math.floor(1 + bytes_so_far / self.chunk_bytes)

    def _write_folder(self, folder_path, folder, entries):
        """Write ``<folder>_links.txt`` and the chunk files for one folder."""
        links_path = os.path.join(folder_path, folder + "_links.txt")
        bytes_so_far = 0
        started = set()        # chunk numbers already created during this call
        current_index = None
        chunk_file = None
        try:
            with open(links_path, "w", encoding="utf-8") as links_file:
                for entry in entries:
                    line = entry.getLink() + "\n"
                    links_file.write(line)
                    # The chunk is chosen from the size accumulated *before*
                    # this entry, so a chunk may overshoot by its last file.
                    index = self._chunk_index(bytes_so_far)
                    bytes_so_far += entry.getSize()
                    if index != current_index:
                        if chunk_file is not None:
                            chunk_file.close()
                        chunk_path = os.path.join(
                            folder_path, f"{folder}_chunk_{index}.txt"
                        )
                        # Truncate a chunk only the first time it is seen in
                        # this run; append if the index is ever revisited.
                        mode = "a" if index in started else "w"
                        chunk_file = open(chunk_path, mode, encoding="utf-8")
                        started.add(index)
                        current_index = index
                    chunk_file.write(line)
        finally:
            if chunk_file is not None:
                chunk_file.close()

31
main.py Normal file
View File

@@ -0,0 +1,31 @@
import sys
from rclone_list import rclone_list
from chucker import chunker
def main(argv):
    """Parse an rclone listing file and write the chunked link files.

    Args:
        argv: Command line in ``sys.argv`` form:
            ``[prog, input_file, base_url, (optional) chunk_size_gb]``.

    Exits with status 1 when arguments are missing or the chunk size is not
    a positive number.
    """
    usage = "Usage: python3 main.py <input_file> <base_url> [chunk_size_gb]"
    if len(argv) < 3:
        print(usage)
        sys.exit(1)

    input_file = argv[1]
    base_url = argv[2]
    # The scheme is dropped here because entries re-add "https://" themselves
    # and percent-quote the rest; a leftover "http://" would get quoted too.
    for scheme in ("https://", "http://"):
        base_url = base_url.removeprefix(scheme)

    if len(argv) >= 4:
        try:
            chunk_size_gb = float(argv[3])
        except ValueError:
            print(f"Invalid chunk size: {argv[3]!r}")
            print(usage)
            sys.exit(1)
        # "not > 0" also rejects NaN, which compares false to everything.
        if not chunk_size_gb > 0:
            print(f"Chunk size must be greater than 0, got {argv[3]!r}")
            print(usage)
            sys.exit(1)
    else:
        # No chunk size given: an infinite chunk keeps every link of a
        # folder in chunk 1 instead of failing on a missing value.
        chunk_size_gb = float("inf")

    ## Creates rclone entry list
    rc_list = rclone_list()
    ## Read each line of given file, send it to rclone_list to be parsed and added
    with open(input_file, 'r', encoding='utf-8') as file:
        for line in file:
            # Remove the padding rclone puts in front of the size column
            line = line.strip()
            if not line:
                # Blank lines carry no entry; skip them instead of parsing.
                continue
            rc_list.addLine(line, base_url)
    rc_list.updateFolders()
    print(rc_list._folders_size_human)

    chun = chunker(chunk_size_gb)
    chun.filter(rc_list)


if __name__ == "__main__":
    main(sys.argv)

17
misc.py Normal file
View File

@@ -0,0 +1,17 @@
import urllib.parse
def bytes_to_gb(size: int) -> float:
    """Convert a size in bytes to gigabytes (1 GB = 1024**3 bytes).

    Args:
        size: Size in bytes.

    Returns:
        The size in GB as a float.
    """
    # Divide, not multiply: multiplying made this identical to gb_to_bytes.
    return size / (1024 ** 3)
def gb_to_bytes(size: int) -> int:
    """Convert a size in gigabytes to bytes (1 GB = 1024**3 bytes)."""
    bytes_per_gb = 1 << 30  # 1024 ** 3
    return size * bytes_per_gb
def bytes_to_human_size(size: int) -> str:
    """Format a byte count as a string with two decimals and a binary unit.

    Args:
        size: Size in bytes.

    Returns:
        Text such as ``"1.50 KB"``; anything of 1024 PB or more is reported
        in EB.
    """
    for unit in ("B", "KB", "MB", "GB", "TB", "PB"):
        # Compare the magnitude so negative values scale like positive ones.
        if abs(size) < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    return f"{size:.2f} EB"
def clean_link(path):
    """Percent-encode ``path`` for use in a URL, leaving ``/`` untouched."""
    encoded = urllib.parse.quote(path, safe="/")
    return encoded

28
rclone_entry.py Normal file
View File

@@ -0,0 +1,28 @@
from misc import clean_link
class rclone_entry:
    """One file of an rclone listing: its folder, path, size and full link."""

    def __init__(self, link: str = "", folder: str = "", path: str = "", size: int = -1):
        """Build an entry.

        Python keeps only one ``__init__`` per class, so a single constructor
        with defaults covers both the empty and the populated form.

        Args:
            link: Base link without scheme; ``path`` is appended to it.
            folder: Folder the file is grouped under.
            path: Path of the file as it appears in the listing.
            size: File size in bytes; ``-1`` when not set.
        """
        self._folder: str = folder
        self._path: str = path
        self._size: int = size
        # With neither a base link nor a path there is nothing to point at,
        # so the link stays empty rather than becoming a bare "https://".
        self._link: str = (
            "https://" + clean_link(link + path) if (link or path) else ""
        )

    def getLink(self, full: bool = True) -> str:
        """Return the full link. ``full`` is accepted but currently ignored."""
        return self._link

    def getFolder(self) -> str:
        """Return the folder this entry is grouped under."""
        return self._folder

    def getPath(self) -> str:
        """Return the path as given at construction."""
        return self._path

    def getSize(self) -> int:
        """Return the file size in bytes (``-1`` when not set)."""
        return self._size

50
rclone_list.py Normal file
View File

@@ -0,0 +1,50 @@
from rclone_entry import rclone_entry
import typing
from misc import bytes_to_human_size
class rclone_list:
    """Collection of parsed rclone listing entries with per-folder totals."""

    def __init__(self):
        """Create an empty list with no base link and no folder totals."""
        self._base_link: str = ""
        self._entry_list: typing.List["rclone_entry"] = []
        self._folders: typing.List[str] = []
        self._folders_size: typing.Dict[str, int] = {}
        self._folders_size_human: typing.Dict[str, str] = {}

    def updateFolders(self):
        """Recompute the folder list and per-folder sizes from the entries.

        The totals are rebuilt from scratch on every call, so calling this
        more than once does not count entries twice. Folders keep the order
        in which they first appear in the entry list.
        """
        totals: typing.Dict[str, int] = {}
        for entry in self._entry_list:
            folder = entry.getFolder()
            totals[folder] = totals.get(folder, 0) + entry.getSize()

        self._folders = list(totals)
        self._folders_size = totals
        self._folders_size_human = {
            folder: bytes_to_human_size(total) for folder, total in totals.items()
        }

    def addLine(self, line: str, link: str = "") -> None:
        """Parse one ``<size> <path>`` listing line and append it as an entry.

        Blank lines are ignored. Lines without a path, with a non-numeric
        size, or with a size of zero or less are reported on stdout and
        skipped.

        Args:
            line: One line of the listing.
            link: Base link for the entry; when empty, ``self._base_link``
                is used instead.
        """
        # Split on the first run of whitespace only, so paths that contain
        # spaces are kept whole and leading padding on the size is ignored.
        fields = line.split(None, 1)
        if not fields:
            return
        if len(fields) < 2:
            print("ERROR, malformed line // ", repr(line))
            return

        try:
            parsed_size: int = int(fields[0])
        except ValueError:
            print("ERROR, malformed size // ", repr(line))
            return
        parsed_path: str = fields[1].strip()
        parsed_folder: str = parsed_path.split('/')[0].strip()

        if parsed_size <= 0:
            # Log error, skip
            print("ERROR, size 0 // ", parsed_folder, parsed_path, parsed_size)
            return

        base = link if link != "" else self._base_link
        self.add(rclone_entry(base, parsed_folder, parsed_path, parsed_size))

    def add(self, entry: "rclone_entry") -> None:
        """Append an already-built entry to the list."""
        self._entry_list.append(entry)