Reformat code
This commit is contained in:
parent
0f717b987d
commit
6e00f70184
32
db.py
32
db.py
|
@ -1,12 +1,13 @@
|
||||||
|
from math import ceil
|
||||||
|
|
||||||
import manticoresearch
|
import manticoresearch
|
||||||
|
import ujson
|
||||||
from manticoresearch.rest import ApiException
|
from manticoresearch.rest import ApiException
|
||||||
|
from numpy import array_split
|
||||||
from redis import StrictRedis
|
from redis import StrictRedis
|
||||||
|
|
||||||
import util
|
import util
|
||||||
from schemas.mc_s import schema
|
from schemas.mc_s import schema
|
||||||
import ujson
|
|
||||||
from numpy import array_split
|
|
||||||
from math import ceil
|
|
||||||
|
|
||||||
configuration = manticoresearch.Configuration(host="http://monolith-db-1:9308")
|
configuration = manticoresearch.Configuration(host="http://monolith-db-1:9308")
|
||||||
api_client = manticoresearch.ApiClient(configuration)
|
api_client = manticoresearch.ApiClient(configuration)
|
||||||
|
@ -15,6 +16,7 @@ api_instance = manticoresearch.IndexApi(api_client)
|
||||||
log = util.get_logger("db")
|
log = util.get_logger("db")
|
||||||
r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0)
|
r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0)
|
||||||
|
|
||||||
|
|
||||||
def store_message(msg):
|
def store_message(msg):
|
||||||
"""
|
"""
|
||||||
Store a message into Manticore
|
Store a message into Manticore
|
||||||
|
@ -30,14 +32,7 @@ def store_message(msg):
|
||||||
if schema[key].startswith("string"):
|
if schema[key].startswith("string"):
|
||||||
msg[key] = str(value)
|
msg[key] = str(value)
|
||||||
|
|
||||||
body = [
|
body = [{"insert": {"index": "main", "doc": msg}}]
|
||||||
{
|
|
||||||
"insert": {
|
|
||||||
"index": "main",
|
|
||||||
"doc": msg
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
body_post = ""
|
body_post = ""
|
||||||
for item in body:
|
for item in body:
|
||||||
body_post += ujson.dumps(item)
|
body_post += ujson.dumps(item)
|
||||||
|
@ -46,11 +41,12 @@ def store_message(msg):
|
||||||
# print(body_post)
|
# print(body_post)
|
||||||
try:
|
try:
|
||||||
# Bulk index operations
|
# Bulk index operations
|
||||||
api_response = api_instance.bulk(body_post, async_req=True)
|
api_instance.bulk(body_post, async_req=True)
|
||||||
# print(api_response)
|
# print(api_response)
|
||||||
except ApiException as e:
|
except ApiException as e:
|
||||||
print("Exception when calling IndexApi->bulk: %s\n" % e)
|
print("Exception when calling IndexApi->bulk: %s\n" % e)
|
||||||
|
|
||||||
|
|
||||||
def store_message_bulk(data):
|
def store_message_bulk(data):
|
||||||
"""
|
"""
|
||||||
Store a message into Manticore
|
Store a message into Manticore
|
||||||
|
@ -71,12 +67,7 @@ def store_message_bulk(data):
|
||||||
if schema[key].startswith("string"):
|
if schema[key].startswith("string"):
|
||||||
msg[key] = str(value)
|
msg[key] = str(value)
|
||||||
|
|
||||||
body = {
|
body = {"insert": {"index": "main", "doc": msg}}
|
||||||
"insert": {
|
|
||||||
"index": "main",
|
|
||||||
"doc": msg
|
|
||||||
}
|
|
||||||
}
|
|
||||||
total.append(body)
|
total.append(body)
|
||||||
|
|
||||||
body_post = ""
|
body_post = ""
|
||||||
|
@ -87,7 +78,7 @@ def store_message_bulk(data):
|
||||||
# print(body_post)
|
# print(body_post)
|
||||||
try:
|
try:
|
||||||
# Bulk index operations
|
# Bulk index operations
|
||||||
api_response = api_instance.bulk(body_post, async_req=True)
|
api_instance.bulk(body_post, async_req=True)
|
||||||
# print(api_response)
|
# print(api_response)
|
||||||
except ApiException as e:
|
except ApiException as e:
|
||||||
print("Exception when calling IndexApi->bulk: %s\n" % e)
|
print("Exception when calling IndexApi->bulk: %s\n" % e)
|
||||||
|
@ -95,9 +86,11 @@ def store_message_bulk(data):
|
||||||
|
|
||||||
print("BULK FINISH")
|
print("BULK FINISH")
|
||||||
|
|
||||||
|
|
||||||
def update_schema():
|
def update_schema():
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def create_index(api_client):
|
def create_index(api_client):
|
||||||
util_instance = manticoresearch.UtilsApi(api_client)
|
util_instance = manticoresearch.UtilsApi(api_client)
|
||||||
schema_types = ", ".join([f"{k} {v}" for k, v in schema.items()])
|
schema_types = ", ".join([f"{k} {v}" for k, v in schema.items()])
|
||||||
|
@ -107,6 +100,5 @@ def create_index(api_client):
|
||||||
util_instance.sql(create_query)
|
util_instance.sql(create_query)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
create_index(api_client)
|
create_index(api_client)
|
||||||
update_schema()
|
update_schema()
|
||||||
|
|
|
@ -8,6 +8,7 @@ from sources.dis import DiscordClient
|
||||||
# For development
|
# For development
|
||||||
if not getenv("DISCORD_TOKEN", None):
|
if not getenv("DISCORD_TOKEN", None):
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
log = util.get_logger("monolith")
|
log = util.get_logger("monolith")
|
||||||
|
|
|
@ -1,199 +1,132 @@
|
||||||
schema = {
|
schema = {
|
||||||
"id": "bigint",
|
"id": "bigint",
|
||||||
|
|
||||||
# 1
|
# 1
|
||||||
"archived": "int",
|
"archived": "int",
|
||||||
|
|
||||||
# 1662150538
|
# 1662150538
|
||||||
"archived_on": "string indexed attribute",
|
"archived_on": "string indexed attribute",
|
||||||
|
|
||||||
# CF
|
# CF
|
||||||
"board_flag": "string indexed attribute",
|
"board_flag": "string indexed attribute",
|
||||||
|
|
||||||
# true, false
|
# true, false
|
||||||
"bot": "bool",
|
"bot": "bool",
|
||||||
|
|
||||||
# 0
|
# 0
|
||||||
"bumplimit": "int",
|
"bumplimit": "int",
|
||||||
|
|
||||||
# mod
|
# mod
|
||||||
"capcode": "string indexed attribute",
|
"capcode": "string indexed attribute",
|
||||||
|
|
||||||
# 393598265, #main, Rust Programmer's Club
|
# 393598265, #main, Rust Programmer's Club
|
||||||
"channel": "text",
|
"channel": "text",
|
||||||
|
|
||||||
# Miscellaneous
|
# Miscellaneous
|
||||||
"channel_category": "text",
|
"channel_category": "text",
|
||||||
|
|
||||||
# 360581491907887100
|
# 360581491907887100
|
||||||
"channel_category_id": "string indexed attribute",
|
"channel_category_id": "string indexed attribute",
|
||||||
|
|
||||||
# true, false
|
# true, false
|
||||||
"channel_category_nsfw": "bool",
|
"channel_category_nsfw": "bool",
|
||||||
|
|
||||||
# 734229101216530600
|
# 734229101216530600
|
||||||
"channel_id": "string indexed attribute",
|
"channel_id": "string indexed attribute",
|
||||||
|
|
||||||
# true, false
|
# true, false
|
||||||
"channel_nsfw": "bool",
|
"channel_nsfw": "bool",
|
||||||
|
|
||||||
# 1
|
# 1
|
||||||
"closed": "int",
|
"closed": "int",
|
||||||
|
|
||||||
# GB
|
# GB
|
||||||
"country": "string indexed attribute",
|
"country": "string indexed attribute",
|
||||||
|
|
||||||
# United Kingdom
|
# United Kingdom
|
||||||
"country_name": "text",
|
"country_name": "text",
|
||||||
|
|
||||||
# 5
|
# 5
|
||||||
"file_custom_spoiler": "int",
|
"file_custom_spoiler": "int",
|
||||||
|
|
||||||
# 1
|
# 1
|
||||||
"file_deleted": "int",
|
"file_deleted": "int",
|
||||||
|
|
||||||
# .jpg
|
# .jpg
|
||||||
"file_ext": "string indexed attribute",
|
"file_ext": "string indexed attribute",
|
||||||
|
|
||||||
# 1024
|
# 1024
|
||||||
"file_h": "int",
|
"file_h": "int",
|
||||||
|
|
||||||
# 1
|
# 1
|
||||||
"file_m_img": "int",
|
"file_m_img": "int",
|
||||||
|
|
||||||
# tlArbrZDj7kbheSKPyDU0w==
|
# tlArbrZDj7kbheSKPyDU0w==
|
||||||
"file_md5": "string indexed attribute",
|
"file_md5": "string indexed attribute",
|
||||||
|
|
||||||
# 88967
|
# 88967
|
||||||
"file_size": "int",
|
"file_size": "int",
|
||||||
|
|
||||||
# 1
|
# 1
|
||||||
"file_spoiler": "int",
|
"file_spoiler": "int",
|
||||||
|
|
||||||
# 1662149436322819
|
# 1662149436322819
|
||||||
"file_tim": "string indexed attribute",
|
"file_tim": "string indexed attribute",
|
||||||
|
|
||||||
# 250
|
# 250
|
||||||
"file_tn_h": "int",
|
"file_tn_h": "int",
|
||||||
|
|
||||||
# 241
|
# 241
|
||||||
"file_tn_w": "int",
|
"file_tn_w": "int",
|
||||||
|
|
||||||
# 1080
|
# 1080
|
||||||
"file_w": "int",
|
"file_w": "int",
|
||||||
|
|
||||||
# 6E646BED-297E-4B4F-9082-31EDADC49472
|
# 6E646BED-297E-4B4F-9082-31EDADC49472
|
||||||
"filename": "text",
|
"filename": "text",
|
||||||
|
|
||||||
# Confederate
|
# Confederate
|
||||||
"flag_name": "string indexed attribute",
|
"flag_name": "string indexed attribute",
|
||||||
|
|
||||||
|
|
||||||
"guild": "text", # LEGACY -> channel
|
"guild": "text", # LEGACY -> channel
|
||||||
"guild_id": "string indexed attribute", # LEGACY -> channel_id
|
"guild_id": "string indexed attribute", # LEGACY -> channel_id
|
||||||
|
|
||||||
# 36180
|
# 36180
|
||||||
"guild_member_count": "int", # ? -> channel_member_count
|
"guild_member_count": "int", # ? -> channel_member_count
|
||||||
|
|
||||||
# 9f7b2e6a0e9b
|
# 9f7b2e6a0e9b
|
||||||
"host": "text",
|
"host": "text",
|
||||||
|
|
||||||
# 2447746
|
# 2447746
|
||||||
"id_reply": "string indexed attribute", # resto
|
"id_reply": "string indexed attribute", # resto
|
||||||
|
|
||||||
# "522, trans rights shill", myname
|
# "522, trans rights shill", myname
|
||||||
"ident": "text",
|
"ident": "text",
|
||||||
|
|
||||||
# 0
|
# 0
|
||||||
"imagelimit": "int",
|
"imagelimit": "int",
|
||||||
|
|
||||||
# 0
|
# 0
|
||||||
"images": "int",
|
"images": "int",
|
||||||
|
|
||||||
# 0
|
# 0
|
||||||
"mode": "string indexed attribute",
|
"mode": "string indexed attribute",
|
||||||
|
|
||||||
# b0n3
|
# b0n3
|
||||||
"modearg": "string indexed attribute",
|
"modearg": "string indexed attribute",
|
||||||
|
|
||||||
# The quick brown fox jumped over the lazy dog
|
# The quick brown fox jumped over the lazy dog
|
||||||
"msg": "text",
|
"msg": "text",
|
||||||
|
|
||||||
# 393605030
|
# 393605030
|
||||||
"msg_id": "string indexed attribute",
|
"msg_id": "string indexed attribute",
|
||||||
|
|
||||||
# pol
|
# pol
|
||||||
"net": "text",
|
"net": "text",
|
||||||
|
|
||||||
# 273534239310479360
|
# 273534239310479360
|
||||||
"net_id": "string indexed attribute",
|
"net_id": "string indexed attribute",
|
||||||
|
|
||||||
# André de Santa Cruz, santa
|
# André de Santa Cruz, santa
|
||||||
"nick": "text",
|
"nick": "text",
|
||||||
|
|
||||||
# 773802568324350000
|
# 773802568324350000
|
||||||
"nick_id": "string indexed attribute",
|
"nick_id": "string indexed attribute",
|
||||||
|
|
||||||
# 1, 2, 3, 4, 5, 6, ...
|
# 1, 2, 3, 4, 5, 6, ...
|
||||||
"num": "int",
|
"num": "int",
|
||||||
|
|
||||||
# 12
|
# 12
|
||||||
"replies": "int",
|
"replies": "int",
|
||||||
|
|
||||||
# redacted-hate-thread
|
# redacted-hate-thread
|
||||||
"semantic_url": "string indexed attribute",
|
"semantic_url": "string indexed attribute",
|
||||||
|
|
||||||
# -1 -> 1 as float
|
# -1 -> 1 as float
|
||||||
"sentiment": "float",
|
"sentiment": "float",
|
||||||
|
|
||||||
# 2022
|
# 2022
|
||||||
"since4pass": "int",
|
"since4pass": "int",
|
||||||
|
|
||||||
# 4ch, irc, dis
|
# 4ch, irc, dis
|
||||||
"src": "string indexed attribute",
|
"src": "string indexed attribute",
|
||||||
|
|
||||||
# true, false
|
# true, false
|
||||||
"status": "bool",
|
"status": "bool",
|
||||||
|
|
||||||
# 1
|
# 1
|
||||||
"sticky": "int",
|
"sticky": "int",
|
||||||
|
|
||||||
# 1000
|
# 1000
|
||||||
"sticky_cap": "int",
|
"sticky_cap": "int",
|
||||||
|
|
||||||
# Redacted Hate Thread, Gorbachev is dead
|
# Redacted Hate Thread, Gorbachev is dead
|
||||||
"sub": "string indexed attribute",
|
"sub": "string indexed attribute",
|
||||||
|
|
||||||
# Loop
|
# Loop
|
||||||
"tag": "string indexed attribute",
|
"tag": "string indexed attribute",
|
||||||
|
|
||||||
# 100
|
# 100
|
||||||
"tail_size": "int",
|
"tail_size": "int",
|
||||||
|
|
||||||
"time": "timestamp", # LEGACY -> ts
|
"time": "timestamp", # LEGACY -> ts
|
||||||
|
|
||||||
"tokens": "text", # ???
|
"tokens": "text", # ???
|
||||||
|
|
||||||
# 2022-09-02T16:10:36
|
# 2022-09-02T16:10:36
|
||||||
"ts": "timestamp",
|
"ts": "timestamp",
|
||||||
|
|
||||||
# msg, notice, update, who
|
# msg, notice, update, who
|
||||||
"type": "string indexed attribute",
|
"type": "string indexed attribute",
|
||||||
|
|
||||||
# 10
|
# 10
|
||||||
"unique_ips": "int",
|
"unique_ips": "int",
|
||||||
|
|
||||||
# 1662149436
|
# 1662149436
|
||||||
"unix_time": "string indexed attribute",
|
"unix_time": "string indexed attribute",
|
||||||
|
|
||||||
# Anonymous
|
# Anonymous
|
||||||
"user": "text",
|
"user": "text",
|
||||||
|
|
||||||
"user_id": "string indexed attribute", # LEGACY -> nick_id
|
"user_id": "string indexed attribute", # LEGACY -> nick_id
|
||||||
|
|
||||||
# 1, 2
|
# 1, 2
|
||||||
"version_sentiment": "int",
|
"version_sentiment": "int",
|
||||||
|
|
||||||
# 1, 2
|
# 1, 2
|
||||||
"version_tokens": "int",
|
"version_tokens": "int",
|
||||||
}
|
}
|
|
@ -1,23 +1,22 @@
|
||||||
# Python modules can't start with a number...
|
# Python modules can't start with a number...
|
||||||
import ujson
|
import asyncio
|
||||||
import random
|
import random
|
||||||
import string
|
import string
|
||||||
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
import ujson
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from siphashc import siphash
|
from siphashc import siphash
|
||||||
|
|
||||||
import db
|
import db
|
||||||
import util
|
import util
|
||||||
from schemas.ch4_s import ATTRMAP
|
from schemas.ch4_s import ATTRMAP
|
||||||
import aiohttp
|
|
||||||
import asyncio
|
|
||||||
from numpy import array_split
|
|
||||||
from math import ceil
|
|
||||||
|
|
||||||
from concurrent.futures import ProcessPoolExecutor
|
|
||||||
p = ProcessPoolExecutor(10)
|
p = ProcessPoolExecutor(10)
|
||||||
|
|
||||||
|
|
||||||
class Chan4(object):
|
class Chan4(object):
|
||||||
"""
|
"""
|
||||||
4chan indexer, crawler and ingester.
|
4chan indexer, crawler and ingester.
|
||||||
|
@ -82,7 +81,10 @@ class Chan4(object):
|
||||||
await self.get_thread_lists(self.boards)
|
await self.get_thread_lists(self.boards)
|
||||||
|
|
||||||
async def get_threads_content(self, thread_list):
|
async def get_threads_content(self, thread_list):
|
||||||
thread_urls = {(board, thread): f"{board}/thread/{thread}.json" for board, thread in thread_list}
|
thread_urls = {
|
||||||
|
(board, thread): f"{board}/thread/{thread}.json"
|
||||||
|
for board, thread in thread_list
|
||||||
|
}
|
||||||
self.log.debug(f"Getting information for threads: {thread_urls}")
|
self.log.debug(f"Getting information for threads: {thread_urls}")
|
||||||
responses = await self.api_call(thread_urls)
|
responses = await self.api_call(thread_urls)
|
||||||
self.log.debug(f"Got information for threads: {thread_urls}")
|
self.log.debug(f"Got information for threads: {thread_urls}")
|
||||||
|
@ -167,26 +169,20 @@ class Chan4(object):
|
||||||
async with session.get(url) as response:
|
async with session.get(url) as response:
|
||||||
try:
|
try:
|
||||||
return (mapped, await response.json())
|
return (mapped, await response.json())
|
||||||
except:
|
except: # noqa
|
||||||
print("FETCH ERROR")
|
print("FETCH ERROR")
|
||||||
return (mapped, None)
|
return (mapped, None)
|
||||||
|
|
||||||
|
|
||||||
async def bound_fetch(self, sem, url, session, mapped):
|
async def bound_fetch(self, sem, url, session, mapped):
|
||||||
# Getter function with semaphore.
|
# Getter function with semaphore.
|
||||||
async with sem:
|
async with sem:
|
||||||
try:
|
try:
|
||||||
return await self.fetch(url, session, mapped)
|
return await self.fetch(url, session, mapped)
|
||||||
except:
|
except: # noqa
|
||||||
print("BOUND ERROR")
|
print("BOUND ERROR")
|
||||||
return (mapped, None)
|
return (mapped, None)
|
||||||
|
|
||||||
async def api_call(self, methods={}):
|
async def api_call(self, methods={}):
|
||||||
headers = {
|
|
||||||
"User-Agent": (
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0"
|
|
||||||
)
|
|
||||||
}
|
|
||||||
tasks = []
|
tasks = []
|
||||||
sem = asyncio.Semaphore(100)
|
sem = asyncio.Semaphore(100)
|
||||||
connector = aiohttp.TCPConnector(limit=None)
|
connector = aiohttp.TCPConnector(limit=None)
|
||||||
|
@ -199,4 +195,3 @@ class Chan4(object):
|
||||||
tasks.append(task)
|
tasks.append(task)
|
||||||
responses = await asyncio.gather(*tasks)
|
responses = await asyncio.gather(*tasks)
|
||||||
return responses
|
return responses
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,6 @@ import discord
|
||||||
|
|
||||||
import db
|
import db
|
||||||
import util
|
import util
|
||||||
|
|
||||||
from schemas.dis_s import ATTRMAP
|
from schemas.dis_s import ATTRMAP
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue