init commit

2025-08-31 21:09:10 -03:00
commit 1e21624588
6 changed files with 222 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,59 @@
+### What is this
+
+This is a parser for rclone file listings.
+If you redirect the output of rclone to a file, like so:
+`rclone ls wikimedia: > size_output.txt`
+you'll get one line per file in that repository, each holding the file size followed by the file path.
+
+Then run this Python script like so:
+`python3 main.py ./size_output.txt https://dumps.wikimedia.org/other/kiwix/zim/ 500`
+Every path is completed into a full link using the given base link, sorted by the root folder it belongs to, and split into chunks of about the number of GB you ask for (500 in this example).
+The link files are written into a `chunked` folder, with one subfolder per root folder of that repository.
+
+For example, `python3 main.py ./size_output.txt https://dumps.wikimedia.org/other/kiwix/zim/ 500`
+prints the total size of each root folder and results in something like this:
+
+```bash
+{'wikiversity': '10.97 GB', 'wikivoyage': '13.33 GB', 'wikinews': '34.19 GB', 'wikisource': '261.42 GB', 'wikiquote': '8.77 GB', 'wiktionary': '63.32 GB', 'wikibooks': '28.29 GB', 'wikipedia': '2.81 TB'}
+$ tree
+.
+├── chunked
+│   ├── wikibooks
+│   │   ├── wikibooks_chunk_1.txt
+│   │   └── wikibooks_links.txt
+│   ├── wikinews
+│   │   ├── wikinews_chunk_1.txt
+│   │   └── wikinews_links.txt
+│   ├── wikipedia
+│   │   ├── wikipedia_chunk_10.txt
+│   │   ├── wikipedia_chunk_11.txt
+│   │   ├── wikipedia_chunk_12.txt
+│   │   ├── wikipedia_chunk_13.txt
+│   │   ├── wikipedia_chunk_14.txt
+│   │   ├── wikipedia_chunk_1.txt
+│   │   ├── wikipedia_chunk_2.txt
+│   │   ├── wikipedia_chunk_3.txt
+│   │   ├── wikipedia_chunk_4.txt
+│   │   ├── wikipedia_chunk_5.txt
+│   │   ├── wikipedia_chunk_6.txt
+│   │   ├── wikipedia_chunk_7.txt
+│   │   ├── wikipedia_chunk_8.txt
+│   │   ├── wikipedia_chunk_9.txt
+│   │   └── wikipedia_links.txt
+│   ├── wikiquote
+│   │   ├── wikiquote_chunk_1.txt
+│   │   └── wikiquote_links.txt
+│   ├── wikisource
+│   │   ├── wikisource_chunk_1.txt
+│   │   ├── wikisource_chunk_2.txt
+│   │   └── wikisource_links.txt
+│   ├── wikiversity
+│   │   ├── wikiversity_chunk_1.txt
+│   │   └── wikiversity_links.txt
+│   ├── wikivoyage
+│   │   ├── wikivoyage_chunk_1.txt
+│   │   └── wikivoyage_links.txt
+│   └── wiktionary
+│       ├── wiktionary_chunk_1.txt
+│       └── wiktionary_links.txt
+```
--- a/chucker.py
+++ b/chucker.py
@@ -0,0 +1,37 @@
+from rclone_list import rclone_list
+from misc import gb_to_bytes
+import os
+import math
+
+class chunker:
+    """Write the links of an rclone listing into per-folder chunk files.
+
+    Each folder gets ``./chunked/<folder>/<folder>_links.txt`` holding every
+    link of that folder, plus numbered ``<folder>_chunk_<n>.txt`` files that
+    each hold about ``chunk_in_gb`` gigabytes worth of links.
+    """
+
+    def __init__(self, chunk_in_gb:float):
+        """Store the chunk size, both in gigabytes and in bytes.
+
+        Args:
+            chunk_in_gb: Target size of one chunk in gigabytes; must be > 0.
+
+        Raises:
+            ValueError: If ``chunk_in_gb`` is ``None`` or not positive.
+        """
+        # Fail early with a clear message: None would otherwise blow up inside
+        # gb_to_bytes, and 0 would cause a ZeroDivisionError later in filter().
+        if chunk_in_gb is None or chunk_in_gb <= 0:
+            raise ValueError("chunk_in_gb must be a positive number of gigabytes")
+        self.chunk_gb = chunk_in_gb
+        self.chunk_bytes = gb_to_bytes(chunk_in_gb)
+
+    def filter(self, rc_list:rclone_list):
+        """Write the master link file and the chunk files of every folder.
+
+        Files are opened in append mode, so running this on top of an existing
+        ``./chunked`` tree adds to the files that are already there.
+
+        Args:
+            rc_list: Parsed listing; its ``_folders`` and ``_entry_list`` are read.
+        """
+        # Group the entries by folder in a single pass instead of rescanning
+        # the whole entry list once per folder.
+        entries_by_folder = {folder: [] for folder in rc_list._folders}
+        for entry in rc_list._entry_list:
+            bucket = entries_by_folder.get(entry.getFolder())
+            if bucket is not None:
+                bucket.append(entry)
+
+        for folder, entries in entries_by_folder.items():
+            folder_path = "./chunked/"+folder+"/"
+            os.makedirs(folder_path, exist_ok=True)
+
+            # Assign every link to its chunk first, so that each chunk file is
+            # opened once rather than once per link.
+            links_by_chunk = {}
+            folder_bytes = 0
+            for entry in entries:
+                # The chunk number is derived from the bytes accumulated
+                # *before* this entry, so a chunk can overshoot by one file.
+                chunk_ext = math.floor(1+folder_bytes/self.chunk_bytes)
+                links_by_chunk.setdefault(chunk_ext, []).append(entry.getLink()+'\n')
+                folder_bytes += entry.getSize()
+
+            # Master file: every link of the folder, in listing order.
+            with open(folder_path+folder+"_links.txt", 'a') as file:
+                file.writelines(entry.getLink()+'\n' for entry in entries)
+
+            for chunk_ext, chunk_links in links_by_chunk.items():
+                with open(folder_path+folder+"_chunk_"+str(chunk_ext)+".txt", 'a') as chunked_file:
+                    chunked_file.writelines(chunk_links)
--- a/main.py
+++ b/main.py
@@ -0,0 +1,31 @@
+
+import sys
+from rclone_list import rclone_list
+from chucker import chunker
+
+if __name__ == "__main__":
+    # Entry point: parse an rclone listing file, print the total size of each
+    # root folder and, when a chunk size is given, write the chunked link files.
+    if len(sys.argv) < 3:
+        print("Usage: python3 main.py <input_file> <base_url> [chunk_size_gb]")
+        sys.exit(1)
+
+    input_file = sys.argv[1]
+    # rclone_entry prepends "https://" itself, so strip it here to avoid
+    # ending up with the scheme twice.
+    base_url = sys.argv[2].removeprefix("https://")
+
+    # The chunk size is optional; without it only the folder sizes are printed.
+    chunk_size_gb = None
+    if len(sys.argv) >= 4:
+        try:
+            chunk_size_gb = float(sys.argv[3])
+        except ValueError:
+            print("chunk_size_gb must be a number, got:", sys.argv[3])
+            sys.exit(1)
+        # "not > 0" also rejects NaN, which "<= 0" would let through.
+        if not chunk_size_gb > 0:
+            print("chunk_size_gb must be greater than 0")
+            sys.exit(1)
+
+    ## Creates rclone entry list
+    rc_list : rclone_list = rclone_list()
+
+    ## Read each line of given file, send it to rclone_list to be parsed and added
+    with open(input_file, 'r', encoding='utf-8') as file:
+        for line in file:
+            # Remove the padding rclone puts around each line
+            line = line.strip()
+            # Blank lines carry no entry and cannot be parsed as a size.
+            if not line:
+                continue
+            rc_list.addLine(line, base_url)
+
+    rc_list.updateFolders()
+    print(rc_list._folders_size_human)
+
+    if chunk_size_gb is not None:
+        chun : chunker = chunker(chunk_size_gb)
+        chun.filter(rc_list)
--- a/misc.py
+++ b/misc.py
@@ -0,0 +1,17 @@
+import urllib.parse
+
+def bytes_to_gb(size:int)->float:
+    """Convert a size in bytes to gigabytes (1 GB = 1024**3 bytes)."""
+    # Divide: multiplying here would make this a duplicate of gb_to_bytes.
+    return size / (1024**3)
+
+def gb_to_bytes(size:int)->int:
+    """Convert a size in gigabytes to bytes (1 GB = 1024**3 bytes)."""
+    bytes_per_gb = 1024 ** 3
+    return size * bytes_per_gb
+
+def bytes_to_human_size(size:int)->str:
+    """Format a byte count as a string with two decimals and a unit.
+
+    The value is divided by 1024 until it drops below 1024, walking through
+    B, KB, MB, GB, TB and PB; anything still larger is reported in EB.
+
+    Args:
+        size: Size in bytes.
+
+    Returns:
+        Text such as ``"1.50 KB"``.
+    """
+    # Work on a copy so the parameter keeps meaning "bytes" throughout.
+    value = size
+    for unit in ("B","KB","MB","GB","TB","PB"):
+        if value < 1024:
+            return f"{value:.2f} {unit}"
+        value /= 1024
+    return f"{value:.2f} EB"
+
+def clean_link(path):
+    """Percent-encode ``path`` for use in a URL, leaving ``/`` untouched."""
+    encoded = urllib.parse.quote(path, safe="/")
+    return encoded
--- a/rclone_entry.py
+++ b/rclone_entry.py
@@ -0,0 +1,28 @@
+from misc import clean_link
+
+class rclone_entry:
+    """One file of an rclone listing: its folder, path, size and full link."""
+
+    def __init__(self,link=None,folder:str="",path:str="",size:int=-1):
+        """Create an entry, or an empty placeholder when called without arguments.
+
+        Python keeps only the last ``__init__`` defined in a class body, so the
+        no-argument form is provided through defaults instead of a second
+        definition that would silently be replaced.
+
+        Args:
+            link: Base link without the ``https://`` scheme, which is added
+                here. ``None`` (the default) leaves the entry's link empty.
+            folder: Folder the entry is grouped under.
+            path: File path; appended to ``link`` to build the full link.
+            size: File size; ``-1`` means it has not been set.
+        """
+        self._folder : str = folder
+        self._path : str = path
+        self._size : int = size
+        if link is None:
+            self._link : str = ""
+        else:
+            # Only the part after the scheme is percent-encoded.
+            self._link : str = "https://" + clean_link(link + path)
+
+    def getLink(self, full : bool = True)->str:
+        """Return the full link of the entry (``full`` is currently unused)."""
+        return self._link
+
+    def getFolder(self)->str:
+        """Return the folder the entry is grouped under."""
+        return self._folder
+
+    def getPath(self)->str:
+        """Return the file path the entry was created with."""
+        return self._path
+
+    def getSize(self)->int:
+        """Return the file size the entry was created with."""
+        return self._size
--- a/rclone_list.py
+++ b/rclone_list.py
@@ -0,0 +1,50 @@
+from rclone_entry import rclone_entry
+import typing
+from misc import bytes_to_human_size
+
+class rclone_list:
+    """Collection of rclone_entry objects plus per-folder size totals."""
+
+    def __init__(self):
+        """Create an empty list with no folders and no size totals."""
+        self._base_link:str = ""
+        self._entry_list : typing.List[rclone_entry] = []
+        # Folder names in the order they are first seen in the entry list.
+        self._folders = []
+        self._folders_size : typing.Dict[str,int] = {}
+        self._folders_size_human : typing.Dict[str,str] = {}
+
+    def updateFolders(self):
+        """Recompute the folder list and the per-folder size totals.
+
+        The totals are rebuilt from scratch on every call, so calling this
+        more than once does not count any entry twice.
+        """
+        folders = []
+        sizes : typing.Dict[str,int] = {}
+        for entry in self._entry_list:
+            folder = entry.getFolder()
+            # Dict membership keeps this linear in the number of entries.
+            if folder not in sizes:
+                folders.append(folder)
+                sizes[folder] = 0
+            sizes[folder] += entry.getSize()
+
+        self._folders = folders
+        self._folders_size = sizes
+        self._folders_size_human = {
+            folder: bytes_to_human_size(total) for folder, total in sizes.items()
+        }
+
+    def addLine(self, line:str, link:str = "")->None:
+        """Parse one "<size> <path>" listing line and add it to the list.
+
+        Blank lines are ignored. Lines without a path, with a size that is not
+        an integer, or with a size of 0 or less are reported on stdout and
+        skipped.
+
+        Args:
+            line: One listing line: an integer size, whitespace, then the path.
+            link: Base link used to build the entry's link; when empty,
+                ``self._base_link`` is used instead.
+        """
+        # Split once, on any run of whitespace, so that spaces inside the
+        # path survive and padding between size and path is tolerated.
+        parts = line.split(None, 1)
+        if not parts:
+            return
+        if len(parts) < 2:
+            print("ERROR, malformed line // ",line)
+            return
+
+        try:
+            parsed_size : int = int(parts[0])
+        except ValueError:
+            print("ERROR, invalid size // ",line)
+            return
+        parsed_path : str = parts[1].strip()
+        # The folder is the first path component.
+        parsed_folder : str = parsed_path.split('/')[0].strip()
+        if(parsed_size<=0):
+            #Log error, skip
+            print("ERROR, size 0 // ",parsed_folder,parsed_path,parsed_size)
+            return
+
+        base_link = self._base_link if link=="" else link
+        self.add(rclone_entry(base_link,parsed_folder,parsed_path,parsed_size))
+
+    def add(self, entry : rclone_entry)->None:
+        """Append an already built entry to the list."""
+        self._entry_list.append(entry)
+