Codeword's solution to "Mass download list of APKs by Package Names"

A **multi-threaded** Python 2.7 program.

**main.py**

```
import threading
import uuid
from Queue import Queue  # Python 2.7 standard-library queue
from spider import Spider
from general import *
import time
import urllib2

PROJECT_NAME = 'downloaded_directory'   # directory the apks will be downloaded into
HOMEPAGE = 'https://apkpure.com'
APP_LIST = 'app_list.txt'               # text file with the app/package names, one per line

NUMBER_OF_THREADS = 4                   # number of worker threads
queue = Queue()
Spider(PROJECT_NAME, HOMEPAGE, APP_LIST)
MAX_REQ = 50                            # pause after this many requests to avoid hammering the site
x = 1                                   # shared request counter
threads = []


def create_spider():
    # start the worker threads as daemons so they exit with the main thread
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        threads.append(t)
        t.daemon = True
        t.start()


def work():
    global x
    while True:
        if x >= MAX_REQ:
            x = 1
            time.sleep(5)
            print "sleeping 5 sec"
        apk = queue.get()
        Spider.crawl_page(threading.current_thread().name, apk)
        queue.task_done()
        x += 1


def create_jobs():
    # queue every app name from the list and wait until all of them are crawled
    for link in file_to_set(APP_LIST):
        queue.put(link)
    queue.join()
    crawl()


def crawl():
    queued_links = file_to_set(APP_LIST)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()


create_spider()
crawl()


def download_apk():
    # crawled_list.txt is generated by the spider; each line is the download link of one apk.
    # Plug a download routine (e.g. cytebode's) into the loop body.
    with open('crawled_list.txt') as f:
        for line in f:
            pass  # download the apk behind `line` here


download_apk()
```

**spider.py**

```
from bs4 import BeautifulSoup
import requests
from general import *


class Spider:

    project_name = ''
    queue_file = ''
    crawled_file = ''
    search_page = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, search_page, app_list):
        Spider.project_name = project_name
        Spider.search_page = search_page
        Spider.queue_file = app_list
        Spider.crawled_file = 'crawled_list.txt'
        self.boot()
        # self.crawl_page('Pioneer spider', Spider.base_apk)

    @staticmethod
    def boot():
        create_project_dir(Spider.project_name)
        create_crawled_list(Spider.crawled_file)
        Spider.queue = file_to_set(Spider.queue_file)
        Spider.crawled = file_to_set(Spider.crawled_file)

    @staticmethod
    def crawl_page(thread_name, apk):
        if apk not in Spider.crawled:
            print(thread_name + ' now crawling ' + apk)
            print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
            s = Spider.gather_download_link(Spider.search_page + '/search?q=' + apk)
            Spider.add_link_to_queue(s)
            Spider.queue.remove(apk)
            Spider.update_files()

    @staticmethod
    def gather_download_link(search_url):
        # search apkpure for the app, follow the first result to its download page
        # and return the direct download link (or None if nothing was found)
        try:
            response = requests.get(search_url, stream=True)
            soup = BeautifulSoup(response.text, "html.parser")
            results = soup.findAll('a', attrs={'class': 'more-down'})
            if results:
                link_part = results[0]['href']
                response_1 = requests.get(Spider.search_page + link_part + '/download?from=details', stream=True)
                soup_1 = BeautifulSoup(response_1.text, "html.parser")
                download_links = soup_1.findAll('a', attrs={'id': 'download_link'})
                if download_links:
                    return download_links[0]['href']
        except Exception as e:
            print(str(e))
        return None

    @staticmethod
    def add_link_to_queue(link):
        if link and link not in Spider.crawled:
            Spider.crawled.add(link)

    @staticmethod
    def update_files():
        set_to_file(Spider.queue, Spider.queue_file)
        set_to_file(Spider.crawled, Spider.crawled_file)
```
**general.py**

```
import os


def create_project_dir(directory):
    if not os.path.exists(directory):
        print('Wait Creating directory ' + directory)
        os.makedirs(directory)


def create_crawled_list(crawled_list):
    if not os.path.isfile(crawled_list):
        write_file(crawled_list, '')


def write_file(path, data):
    with open(path, 'w') as f:
        f.write(data)


def append_to_file(path, data):
    with open(path, 'a') as file:
        file.write(data + '\n')


def delete_file_contents(path):
    open(path, 'w').close()


def file_to_set(file_name):
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results


def set_to_file(links, file_name):
    with open(file_name, "w") as f:
        for l in sorted(links):
            f.write(l + "\n")
```

**NOTE**
1. Place all files in the same folder.
2. Create a text file named app_list.txt **containing the list of app/package names, one per line**.
3. This is a multi-threaded application, so it can look up several download links at the same time, which makes it suitable for large lists.
4. I have not written the download function; you can use the function provided by cytebode (a minimal sketch is given after this list).
5. Every download link the program finds is written to a separate text file named crawled_list.txt **the program creates this file automatically**.
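For completeness, here is a minimal sketch of what that download step could look like. It is not part of the original solution: it assumes the lines in crawled_list.txt are direct, unauthenticated HTTP(S) URLs and simply streams each one into the downloaded_directory folder with requests; the filename handling (URL basename with a uuid fallback) is illustrative only.

```
import os
import uuid
import requests

DOWNLOAD_DIR = 'downloaded_directory'   # same as PROJECT_NAME in main.py
CRAWLED_FILE = 'crawled_list.txt'       # produced by the spider


def download_all_apks():
    # stream every link from crawled_list.txt into DOWNLOAD_DIR
    with open(CRAWLED_FILE) as f:
        for line in f:
            url = line.strip()
            if not url:
                continue
            # derive a filename from the URL, fall back to a random name (illustrative choice)
            name = url.split('/')[-1].split('?')[0] or (str(uuid.uuid4()) + '.apk')
            path = os.path.join(DOWNLOAD_DIR, name)
            try:
                response = requests.get(url, stream=True)
                with open(path, 'wb') as out:
                    for chunk in response.iter_content(chunk_size=1024 * 64):
                        if chunk:
                            out.write(chunk)
                print('downloaded ' + path)
            except Exception as e:
                print('failed ' + url + ': ' + str(e))


if __name__ == '__main__':
    download_all_apks()
```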
