Implement indexing into Apache Druid #1

Closed
m wants to merge 263 commits from druid into master
2 changed files with 6 additions and 6 deletions
Showing only changes of commit 297bbbe035 - Show all commits

View File

@@ -58,8 +58,8 @@ schema = {
"filename": "text", "filename": "text",
# Confederate # Confederate
"flag_name": "string indexed attribute", "flag_name": "string indexed attribute",
"guild": "text", # LEGACY -> channel #"guild": "text", # LEGACY -> channel
"guild_id": "string indexed attribute", # LEGACY -> channel_id #"guild_id": "string indexed attribute", # LEGACY -> channel_id
# 36180 # 36180
"guild_member_count": "int", # ? -> channel_member_count "guild_member_count": "int", # ? -> channel_member_count
# 9f7b2e6a0e9b # 9f7b2e6a0e9b
@@ -112,7 +112,7 @@ schema = {
"tag": "string indexed attribute", "tag": "string indexed attribute",
# 100 # 100
"tail_size": "int", "tail_size": "int",
"time": "timestamp", # LEGACY -> ts #"time": "timestamp", # LEGACY -> ts
"tokens": "text", # ??? "tokens": "text", # ???
# 2022-09-02T16:10:36 # 2022-09-02T16:10:36
"ts": "timestamp", "ts": "timestamp",
@@ -124,7 +124,7 @@ schema = {
"unix_time": "string indexed attribute", "unix_time": "string indexed attribute",
# Anonymous # Anonymous
"user": "text", "user": "text",
"user_id": "string indexed attribute", # LEGACY -> nick_id #"user_id": "string indexed attribute", # LEGACY -> nick_id
# 1, 2 # 1, 2
"version_sentiment": "int", "version_sentiment": "int",
# 1, 2 # 1, 2

View File

@@ -19,7 +19,7 @@ from schemas.ch4_s import ATTRMAP
# CONFIGURATION # # CONFIGURATION #
# Number of 4chan threads to request at once # Number of 4chan threads to request at once
THREADS_CONCURRENT = 1000 THREADS_CONCURRENT = 100
# Seconds to wait between every THREADS_CONCURRENT requests # Seconds to wait between every THREADS_CONCURRENT requests
THREADS_DELAY = 0.1 THREADS_DELAY = 0.1
@@ -31,7 +31,7 @@ CRAWL_DELAY = 5
THREADS_SEMAPHORE = 100 THREADS_SEMAPHORE = 100
# Maximum number of CPU threads to use for post processing # Maximum number of CPU threads to use for post processing
CPU_THREADS = 2 CPU_THREADS = 1
# CONFIGURATION END # # CONFIGURATION END #