init commit
This commit is contained in:
59
README.md
Normal file
59
README.md
Normal file
@@ -0,0 +1,59 @@
|
||||
### What is this
|
||||
|
||||
This is a file parser for rclone size output.
|
||||
if you output rclone to a file as such
|
||||
`rclone ls wikimedia: > size_output.txt`
|
||||
you'll get a list of file sizes and file paths for all files in that repository.
|
||||
|
||||
by using this python script like so:
|
||||
`python3 main.py ./size_output.txt https://dumps.wikimedia.org/other/kiwix/zim/ 500`
|
||||
You'll get all those links sorted, completed with a given base link, and chunked by the amount of GBs you desire per chunk.
|
||||
The output is written into a folder, divided by the root folders of that repository.
|
||||
|
||||
python3 main.py ./size_output.txt https://dumps.wikimedia.org/other/kiwix/zim/ 500
|
||||
resulting in something like this:
|
||||
|
||||
```bash
|
||||
{'wikiversity': '10.97 GB', 'wikivoyage': '13.33 GB', 'wikinews': '34.19 GB', 'wikisource': '261.42 GB', 'wikiquote': '8.77 GB', 'wiktionary': '63.32 GB', 'wikibooks': '28.29 GB', 'wikipedia': '2.81 TB'}
|
||||
$ tree
|
||||
.
|
||||
├── chunked
|
||||
│ ├── wikibooks
|
||||
│ │ ├── wikibooks_chunk_1.txt
|
||||
│ │ └── wikibooks_links.txt
|
||||
│ ├── wikinews
|
||||
│ │ ├── wikinews_chunk_1.txt
|
||||
│ │ └── wikinews_links.txt
|
||||
│ ├── wikipedia
|
||||
│ │ ├── wikipedia_chunk_10.txt
|
||||
│ │ ├── wikipedia_chunk_11.txt
|
||||
│ │ ├── wikipedia_chunk_12.txt
|
||||
│ │ ├── wikipedia_chunk_13.txt
|
||||
│ │ ├── wikipedia_chunk_14.txt
|
||||
│ │ ├── wikipedia_chunk_1.txt
|
||||
│ │ ├── wikipedia_chunk_2.txt
|
||||
│ │ ├── wikipedia_chunk_3.txt
|
||||
│ │ ├── wikipedia_chunk_4.txt
|
||||
│ │ ├── wikipedia_chunk_5.txt
|
||||
│ │ ├── wikipedia_chunk_6.txt
|
||||
│ │ ├── wikipedia_chunk_7.txt
|
||||
│ │ ├── wikipedia_chunk_8.txt
|
||||
│ │ ├── wikipedia_chunk_9.txt
|
||||
│ │ └── wikipedia_links.txt
|
||||
│ ├── wikiquote
|
||||
│ │ ├── wikiquote_chunk_1.txt
|
||||
│ │ └── wikiquote_links.txt
|
||||
│ ├── wikisource
|
||||
│ │ ├── wikisource_chunk_1.txt
|
||||
│ │ ├── wikisource_chunk_2.txt
|
||||
│ │ └── wikisource_links.txt
|
||||
│ ├── wikiversity
|
||||
│ │ ├── wikiversity_chunk_1.txt
|
||||
│ │ └── wikiversity_links.txt
|
||||
│ ├── wikivoyage
|
||||
│ │ ├── wikivoyage_chunk_1.txt
|
||||
│ │ └── wikivoyage_links.txt
|
||||
│ └── wiktionary
|
||||
│ ├── wiktionary_chunk_1.txt
|
||||
│ └── wiktionary_links.txt
|
||||
```
|
||||
37
chucker.py
Normal file
37
chucker.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from rclone_list import rclone_list
|
||||
from misc import gb_to_bytes
|
||||
import os
|
||||
import math
|
||||
|
||||
class chunker:
    """Splits an rclone_list's links into per-root-folder files, chunked by size.

    For every root folder this writes:
      ./chunked/<folder>/<folder>_links.txt      -- every link for that folder
      ./chunked/<folder>/<folder>_chunk_<n>.txt  -- links split so each chunk
                                                    holds roughly chunk_in_gb GB
    """

    def __init__(self, chunk_in_gb: int):
        # Keep the budget both in GB (for reference) and in bytes
        # (used when deciding which chunk an entry lands in).
        self.chunk_gb = chunk_in_gb
        self.chunk_bytes = gb_to_bytes(chunk_in_gb)

    def filter(self, rc_list: rclone_list):
        """Write the master links file and the size-bounded chunk files.

        Args:
            rc_list: parsed rclone listing whose folders/entries are written out.
        """
        # Group entries by folder in ONE pass instead of rescanning the whole
        # entry list once per folder (original was O(folders * entries)).
        # Per-folder entry order is preserved, so output files are identical.
        entries_by_folder = {folder: [] for folder in rc_list._folders}
        for entry in rc_list._entry_list:
            if entry.getFolder() in entries_by_folder:
                entries_by_folder[entry.getFolder()].append(entry)

        for folder, entries in entries_by_folder.items():
            # Make an actual directory for this root folder.
            folder_path = "./chunked/" + folder + "/"
            os.makedirs(folder_path, exist_ok=True)

            accumulated = 0  # bytes seen for this folder so far
            with open(folder_path + folder + "_links.txt", 'a') as file:
                for entry in entries:
                    # Write to the master file.
                    file.write(entry.getLink() + '\n')

                    # Chunk number derives from the size accumulated BEFORE
                    # this entry, so the first chunk is number 1 (matches the
                    # original floor(1 + size/chunk_bytes) arithmetic).
                    chunk_ext = math.floor(1 + accumulated / self.chunk_bytes)
                    accumulated = accumulated + entry.getSize()
                    with open(folder_path + folder + "_chunk_" + str(chunk_ext) + ".txt", 'a') as chunked_file:
                        chunked_file.write(entry.getLink() + '\n')
|
||||
31
main.py
Normal file
31
main.py
Normal file
@@ -0,0 +1,31 @@
|
||||
|
||||
import sys
|
||||
from rclone_list import rclone_list
|
||||
from chucker import chunker
|
||||
|
||||
if __name__ == "__main__":
    # The chunk size is effectively required: passing None into chunker()
    # crashed with a TypeError in gb_to_bytes, so require all three args.
    if len(sys.argv) < 4:
        print("Usage: python3 main.py <input_file> <base_url> <chunk_size_gb>")
        sys.exit(1)

    input_file = sys.argv[1]
    base_url = sys.argv[2]

    # rclone_entry prepends "https://" itself, so strip the scheme here.
    base_url = base_url.removeprefix("https://")
    chunk_size_gb = float(sys.argv[3])

    ## Creates rclone entry list
    rc_list: rclone_list = rclone_list()

    ## Read each line of the given file, send it to rclone_list to be parsed and added
    with open(input_file, 'r', encoding='utf-8') as file:
        for line in file:
            # Remove the padding whitespace rclone puts around the size column
            line = line.strip()
            rc_list.addLine(line, base_url)

    rc_list.updateFolders()
    print(rc_list._folders_size_human)

    chun: chunker = chunker(chunk_size_gb)
    chun.filter(rc_list)
|
||||
17
misc.py
Normal file
17
misc.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import urllib.parse
|
||||
|
||||
def bytes_to_gb(size: int) -> float:
    """Convert a byte count to gibibytes.

    Bug fix: the original multiplied by 1024**3 (i.e. the inverse
    conversion), so "bytes_to_gb" actually returned bytes * 2**30.
    """
    return size / (1024 ** 3)
|
||||
|
||||
def gb_to_bytes(size: int) -> int:
    """Convert gibibytes to a byte count (1 GB == 1024**3 bytes)."""
    bytes_per_gb = 1024 ** 3
    return size * bytes_per_gb
|
||||
|
||||
def bytes_to_human_size(size: int) -> str:
    """Render a byte count as a human-readable string, e.g. 1536 -> '1.50 KB'.

    Fixed the return annotation: this function returns a str, not an int.
    Divides by 1024 per step, so units are binary (KiB-style) despite the
    SI-looking labels.
    """
    for unit in ["B", "KB", "MB", "GB", "TB", "PB"]:
        if size < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    # Anything that survived all divisions is in exabyte territory.
    return f"{size:.2f} EB"
|
||||
|
||||
def clean_link(path):
    """Percent-encode *path* so it is safe to embed in a URL.

    Slashes are left intact (urllib's default safe characters), so a full
    path keeps its directory structure.
    """
    encoded = urllib.parse.quote(path)
    return encoded
|
||||
28
rclone_entry.py
Normal file
28
rclone_entry.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from misc import clean_link
|
||||
|
||||
class rclone_entry:
    """One parsed line of `rclone ls` output: a file's size, root folder,
    relative path, and the fully-qualified download link."""

    def __init__(self, link: str, folder: str, path: str, size: int):
        """Build an entry.

        Args:
            link:   base URL without scheme; prepended to the path.
            folder: root folder of the repository the file lives under.
            path:   file path relative to the repository root.
            size:   file size in bytes (annotation fixed: was `str`).

        Note: the original class defined two __init__ methods; the first
        (zero-argument) one was dead code silently replaced by this one,
        so it has been removed.
        """
        self._folder: str = folder
        self._path: str = path
        self._size: int = size
        # Percent-encode so spaces etc. survive inside the URL.
        self._link: str = "https://" + clean_link(link + path)

    def getLink(self, full: bool = True) -> str:
        # `full` is accepted for interface compatibility but currently unused.
        return self._link

    def getFolder(self) -> str:
        return self._folder

    def getPath(self) -> str:
        return self._path

    def getSize(self) -> int:
        return self._size
|
||||
50
rclone_list.py
Normal file
50
rclone_list.py
Normal file
@@ -0,0 +1,50 @@
|
||||
from rclone_entry import rclone_entry
|
||||
import typing
|
||||
from misc import bytes_to_human_size
|
||||
|
||||
class rclone_list:
    """Collection of rclone_entry objects parsed from `rclone ls` output,
    with per-root-folder size aggregation."""

    def __init__(self):
        self._base_link: str = ""
        self._entry_list: typing.List[rclone_entry] = []
        self._folders: typing.List[str] = []
        self._folders_size: typing.Dict[str, int] = {}
        self._folders_size_human: typing.Dict[str, str] = {}

    def updateFolders(self):
        """Recompute the folder list, per-folder byte totals, and the
        human-readable size strings from the current entry list."""
        for entry in self._entry_list:
            folder = entry.getFolder()
            if folder not in self._folders:
                self._folders.append(folder)
                self._folders_size[folder] = entry.getSize()
            else:
                self._folders_size[folder] = self._folders_size[folder] + entry.getSize()

        for folder in self._folders_size:
            self._folders_size_human[folder] = bytes_to_human_size(self._folders_size[folder])

    def addLine(self, line: str, link: str = "") -> None:
        """Parse one stripped `rclone ls` line ("<size> <path>") and add
        an entry built from it.

        Args:
            line: a whitespace-stripped line of rclone output.
            link: base URL to prepend; falls back to self._base_link when "".
        """
        # Split on any whitespace, at most once, so paths that themselves
        # contain spaces stay intact (the original split(' ')[1] truncated
        # them at the first space).
        parts = line.split(None, 1)
        if len(parts) < 2:
            # Malformed line: skip it. (The original did `pass` here and
            # then crashed on the index access below.)
            return

        parsed_size: int = int(parts[0])
        parsed_path: str = parts[1].strip()
        parsed_folder: str = parsed_path.split('/')[0].strip()

        if parsed_size <= 0:
            # Log error, skip: zero-size files add nothing to a chunk.
            print("ERROR, size 0 // ", parsed_folder, parsed_path, parsed_size)
            return

        base = self._base_link if link == "" else link
        new_entry: rclone_entry = rclone_entry(base, parsed_folder, parsed_path, parsed_size)
        self.add(new_entry)

    def add(self, entry: rclone_entry) -> None:
        """Append an already-constructed entry to the list."""
        self._entry_list.append(entry)
|
||||
|
||||
Reference in New Issue
Block a user