Compare commits

308 Commits
master ... prod

Author SHA1 Message Date
808ed18b74 Switch quickstart setting to nano 2022-10-04 20:37:02 +01:00
34e589aa9c Set Superset env file relative to docker directory 2022-10-04 20:30:14 +01:00
cc6340acab Add persistent Redis data store and copy over Druid config to production 2022-10-04 20:26:58 +01:00
7b73229d5a Add Apache Superset and fix Druid resource usage 2022-10-04 20:17:04 +01:00
35ba2cc947 Add postgres config to Metabase 2022-10-02 14:29:40 +01:00
817bfd8835 Time stuff and switch to gensim for tokenisation 2022-10-01 14:46:45 +01:00
40cf0c6430 Remove commented debug code 2022-09-30 07:22:22 +01:00
63081f68b7 Use only one Redis key for the queue to make chunk size more precise for thread allocation 2022-09-30 07:22:22 +01:00
5992498493 Remove ujson 2022-09-30 15:30:34 +01:00
328db4a6da Reformat 2022-09-30 15:23:00 +01:00
c5c834da82 Add config file to Turnilo 2022-09-27 08:30:28 +01:00
a8dbabd85e Implement uvloop 2022-09-23 07:20:30 +01:00
56b5c85fac Print Ingest settings on start 2022-09-23 08:32:29 +01:00
fc7450c33a Make debug output cleaner 2022-09-22 17:39:29 +01:00
0e9a016e2a Fix indexer options 2022-09-22 17:39:18 +01:00
763501d1ee Fix Java variable in indexer parameters 2022-09-22 08:41:59 +01:00
40a215e6ec Decrease memory requirements further and switch Kafka image 2022-09-21 21:11:13 +01:00
7abf9a00cb Set Kafka max heap size 2022-09-21 20:26:05 +01:00
bd3f1ecd53 Set max memory for Metabase 2022-09-21 14:39:11 +01:00
64ebcedd76 Remove debugging code and fix regex substitution 2022-09-21 12:48:54 +01:00
3d293daad3 Change dev container names 2022-09-21 12:09:18 +01:00
00890860c0 Change prod container names 2022-09-21 12:08:29 +01:00
b0efaeef90 Remove prod compose comment 2022-09-21 12:04:54 +01:00
d6d19625f3 Remove commented code for debugging 2022-09-21 10:02:05 +01:00
cf4aa45663 Normalise fields in processing and remove invalid characters 2022-09-21 10:01:12 +01:00
48e4c07959 Make production volumes point to external storage 2022-09-21 10:00:48 +01:00
027c43b60a Don't muddle up the topics when sending Kafka batches 2022-09-20 23:03:02 +01:00
e0803d4934 Document new PROCESS_THREADS setting in example file 2022-09-20 22:43:04 +01:00
6de17063a2 Make CPU threads configurable 2022-09-20 22:29:13 +01:00
2c5133a546 Make performance settings configurable 2022-09-20 22:22:13 +01:00
24929a5fbb Set memory size to 2.5GB 2022-09-08 07:20:30 +01:00
f336d96268 Update DirectMemorySize to be 1.5GB 2022-09-19 21:51:07 +01:00
315e477916 Make MaxDirectMemory 0.5*cores 2022-09-19 19:15:57 +01:00
006677819d Make max memory size 512m 2022-09-19 19:10:33 +01:00
93a0be98ce Further decrease Druid memory requirements 2022-09-19 17:07:15 +01:00
14322f5090 Bump production Kafka healthcheck timeout 2022-09-19 11:18:52 +01:00
d94da5ac5c Decrease production Druid max memory size 2022-09-19 10:51:34 +01:00
a1382ee46d Increase Kafka retries 2022-09-19 10:48:29 +01:00
5e6b962ea8 Change Metabase port 2022-09-18 13:15:10 +01:00
e8dd847b36 Add docker environment file 2022-09-18 13:05:08 +01:00
d68bcfaebd Update production compose 2022-09-18 13:04:08 +01:00
ebfa06e8d6 Reformat comment 2022-09-18 13:02:06 +01:00
3ed382ec13 Implement restricted sources 2022-09-18 13:01:19 +01:00
dab5e81715 Fix merge conflict 2022-09-16 17:45:24 +01:00
143f2a0bf0 Implement sentiment/NLP annotation and optimise processing 2022-09-16 17:09:49 +01:00
4ea77ac543 Properly process Redis buffered messages and ingest into Kafka 2022-09-14 18:32:32 +01:00
fec0d379a6 Ingest into Kafka and queue messages better 2022-09-13 22:17:46 +01:00
3c2adfc16e Implement Apache Druid/Kafka and Metabase 2022-09-13 22:17:32 +01:00
4c6fe87b88 Switch to latest image for dev docker-compose 2022-09-13 09:20:43 +01:00
79a430be04 Begin implementing Apache Druid 2022-09-08 07:20:30 +01:00
baea6aebeb Use stable after all 2022-09-08 07:20:30 +01:00
eaecc5cdbe Switch production image back to dev 2022-09-08 07:20:30 +01:00
764e36ef14 Lower memory requirements to prevent crashes 2022-09-08 07:20:30 +01:00
50a873dbba Set dev image back to the default 2022-09-12 08:43:18 +01:00
21182629b4 Treat text fields as string and try beta Kibana image 2022-09-12 08:27:13 +01:00
dfd71b6c64 Add Mysql port to ports instead of expose 2022-09-10 13:20:06 +01:00
1b0817b047 Expose the Mysql port 2022-09-10 13:16:19 +01:00
0ba4929294 Use dev image of manticore 2022-09-10 12:03:45 +01:00
caded433b7 Remove indexer block to attempt to prevent Manticore DB crash 2022-09-08 07:20:30 +01:00
bf802d7fdf Reformat 2022-09-07 07:20:30 +01:00
89328a827a Raise open files limit for Redis 2022-09-07 07:20:30 +01:00
32249a1d99 Add 4chan update message type to main types 2022-09-07 07:20:30 +01:00
cdd12cd082 Implement threshold writing to Redis and manticore ingesting from Redis 2022-09-07 07:20:30 +01:00
137299fe9e Add config directories to gitignore 2022-09-08 09:45:18 +01:00
2aedcf77a0 Add aioredis 2022-09-08 09:44:27 +01:00
49784dfbe5 Implement ingesting to Redis from Threshold 2022-09-07 07:20:30 +01:00
a6b5348224 Config relative to Git dir 2022-09-05 07:20:30 +01:00
d0fe2baafe Store persistent database elsewhere 2022-09-05 07:20:30 +01:00
e092327932 Improve DB performance with caching 2022-09-05 07:20:30 +01:00
8b9ad05089 Reformat legacy project 2022-09-05 07:20:30 +01:00
6b082adeb2 Merge branch 'threshold' 2022-09-06 12:50:25 +01:00
bd9f9378cf Moved files to subdirectory 2022-09-06 12:50:09 +01:00
62fe03a6cb Increase thread delay time 2022-09-05 07:20:30 +01:00
297bbbe035 Alter schemas and 4chan performance settings 2022-09-05 07:20:30 +01:00
ed7c439b56 Remove some debugging code 2022-09-05 07:20:30 +01:00
ecb8079b5b Change Python to 3.10 2022-09-05 07:20:30 +01:00
6811ce4af5 Update production env file path 2022-09-05 07:20:30 +01:00
e34d281774 Remove development dotenv loading 2022-09-05 07:20:30 +01:00
91e18c60e6 Add debug statement 2022-09-05 07:20:30 +01:00
9c9d49dcd2 Reformat and set the net and channel for 4chan 2022-09-05 07:20:30 +01:00
dcd648e1d2 Make crawler more efficient and implement configurable parameters 2022-09-05 07:20:30 +01:00
318a8ddbd5 Split thread list into chunks to save memory 2022-09-05 07:20:30 +01:00
20e22ae7ca Reformat code 2022-09-04 21:40:04 +01:00
8feccbbf00 Reinstate Redis cache 2022-09-04 21:38:53 +01:00
db46fea550 Run processing in thread 2022-09-04 21:29:00 +01:00
22cef33342 Implement aiohttp 2022-09-04 19:44:25 +01:00
663a26778d Begin implementing aiohttp 2022-09-04 13:47:32 +01:00
36de004ee5 Implement running Discord and 4chan gathering simultaneously 2022-09-02 22:30:45 +01:00
2c3d83fe9a Fix error when no email can be found 2022-08-27 11:19:28 +01:00
d7adffb47f Fix getting first relay when they are not sequential 2022-08-26 22:17:12 +01:00
4f4820818a Log authentication messages 2022-08-16 23:01:42 +01:00
5cc38da00e Implement deduplicating channels 2022-08-16 22:01:35 +01:00
a4dae2a583 Switch to siphash 2022-08-18 07:20:30 +01:00
5f1667869f Re-add fake messages 2022-08-15 19:49:21 +01:00
09a5cd14ad Detect queries if nick and channel are the same 2022-08-15 19:24:42 +01:00
96de70aaf2 Add sinst fetch and fix message send logic 2022-08-15 19:15:12 +01:00
f8c1e952bb Switch debugging statements to trace in ChanKeep 2022-08-15 19:15:00 +01:00
36628e157d Fix query handling and don't send a fake message 2022-08-15 17:59:31 +01:00
aeee745ac9 Only run pingmsg after negative has completed 2022-08-18 07:20:30 +01:00
d795af164f Fix debug statements and amend function names 2022-08-18 07:20:30 +01:00
4acadd3508 Properly format string 2022-08-18 07:20:30 +01:00
5c4904ba56 Improve regPing debugging 2022-08-18 07:20:30 +01:00
4e88b93856 Improve regPing negative handling logic 2022-08-18 07:20:30 +01:00
af1dba5741 Fix double messages and regPing logic 2022-08-18 07:20:30 +01:00
553e2eb2b7 Set the channel limit on connected relays, not active 2022-08-18 07:20:30 +01:00
3dfc6d736a Look before you leap to confirming registrations 2022-08-18 07:20:30 +01:00
7ef76d1424 Fix IRC config mutation 2022-08-18 07:20:30 +01:00
d78600a2f1 Change authentication endpoint 2022-08-18 07:20:30 +01:00
f004bd47af Reorder API endpoints to prevent clashing 2022-08-18 07:20:30 +01:00
fafcff1427 Add more debugging information 2022-08-15 00:39:22 +01:00
e56bd61362 Figure out the channel parsing logic 2022-08-15 00:36:36 +01:00
2b7bd486f1 Pass a list instead of listinfo 2022-08-15 00:29:08 +01:00
a9592a85d0 Fix variable placement 2022-08-15 00:27:16 +01:00
e77c046965 Fix list parsing 2022-08-15 00:26:11 +01:00
7a8cee1431 Fix debugging code in keepChannels 2022-08-15 00:08:11 +01:00
e6527b4f9f Add debugging code in keepChannels 2022-08-15 00:07:29 +01:00
8979a03bbd Subtract one from length of list for indices 2022-08-15 00:04:49 +01:00
f7b84913f2 Lower max_chans to length of LIST if it's shorter 2022-08-15 00:03:12 +01:00
d46c98a211 Reset negative pass status when requesting recheck 2022-08-14 23:58:35 +01:00
d68f0589cb Implement initial WHO loop delay 2022-08-14 20:58:41 +01:00
d9ec68708b Fix getting all unregistered relays 2022-08-14 20:58:30 +01:00
1b77c50552 Blacklist channels we are kicked from 2022-08-14 20:44:04 +01:00
1ce5a8228c Use JSON for sending messages 2022-08-14 16:45:40 +01:00
f6f515b308 Implement API call to register 2022-08-14 16:26:09 +01:00
9864b4e2b5 Convert num to number in registration confirmation 2022-08-14 16:09:32 +01:00
2fdd0cf6b8 Allow current nick substitution in IRC commands 2022-08-14 15:53:18 +01:00
8c809ad444 Fix variable shadowing 2022-08-14 15:43:48 +01:00
2022ab985b Print identification message 2022-08-14 13:51:13 +01:00
b5e78bc4de Implement manual authentication mode 2022-08-14 13:13:05 +01:00
eba2c387f0 Implement API for authentication management actions 2022-08-14 12:43:33 +01:00
5123941c79 More debugging for reg tests and getstr command 2022-08-14 11:41:29 +01:00
6cc07c9171 Add allRelaysActive output to network info 2022-08-14 10:58:28 +01:00
ed1f3cdca7 Add debug statements and only check if network is connected when parting channels 2022-08-14 09:25:54 +01:00
128e005611 Use JSON for joining channels and don't shadow auth variable when getting network info 2022-08-14 09:25:01 +01:00
713e03b66e Make channel deletion endpoint accept JSON 2022-08-14 00:01:14 +01:00
a0761ff1ae LBYL 2022-08-13 23:38:13 +01:00
15523bed96 Add more information to relay API return 2022-08-13 23:36:39 +01:00
653d9ea4f9 Add even more debugging 2022-08-13 23:18:56 +01:00
f1229a76e1 Extra debugging for getting active relays 2022-08-13 23:17:26 +01:00
d4bcbf99e5 Fix typo in module name 2022-08-13 23:14:51 +01:00
e517d04095 Extra debugging for get_first_relay 2022-08-13 23:14:17 +01:00
65697ce8f0 Filter queries more carefully 2022-08-13 22:46:10 +01:00
ab9b0a1c9f Update CHANLIMIT on all instances when set via API 2022-08-13 22:36:52 +01:00
60f7a84383 Add helper to get all active relays 2022-08-13 22:36:18 +01:00
956d328fd3 Implement API endpoint to enable authentication 2022-08-13 22:25:29 +01:00
dcd7fcc3c0 Filter AUTH channel (OFTC fix) 2022-08-13 22:15:50 +01:00
7415ca5556 Use ChanKeep system for joining channels with joinSingle 2022-08-13 21:54:14 +01:00
9780a2dfc8 Fully make use of ECA for multiple channels 2022-08-13 21:40:53 +01:00
c7fa508a38 Return chanlimit for each relay 2022-08-13 21:22:43 +01:00
b83062c34f Check token before attempting to confirm 2022-08-13 20:55:36 +01:00
2e57e0930a Implement API endpoint for provisioning relays 2022-08-13 20:51:31 +01:00
43c5625b3b Implement configurable chanlimit and add more fields about LIST output to Redis 2022-08-13 20:37:21 +01:00
291968fbc7 Implement updating registration via API 2022-08-13 20:36:51 +01:00
dd67e9cc8b Implement ChanKeep without requiring persistent chanlimits on all networks 2022-08-13 19:20:29 +01:00
c145e5cf18 Add some debug statements and statistics for chanlimits 2022-08-13 18:40:13 +01:00
5db0373731 Print message if relay is unauthenticated/disconnected 2022-08-13 14:06:34 +01:00
6c11bbe912 Return relay numbers with channel list 2022-08-13 13:47:42 +01:00
4d543f31ec Add connected status to IRC info return and check when getting active relays 2022-08-13 13:40:33 +01:00
6c92e8e7d9 Reformat code 2022-08-13 13:32:22 +01:00
836e621063 Implement getting LIST information from API 2022-08-13 13:27:20 +01:00
852d62a9c9 Provision relay on creation 2022-08-13 00:18:06 +01:00
ddc9af0ddf Add docstrings to chankeep 2022-08-12 23:53:02 +01:00
edfb3f15eb Implement migrating networks 2022-08-12 23:32:00 +01:00
14967f662c Subtract allocated channel slots from total 2022-08-12 22:31:12 +01:00
0b370fc155 Improve channel allocation and write basic tests for it 2022-08-12 22:27:49 +01:00
9804f30060 Make channel join notification a TRACE 2022-08-12 20:19:39 +01:00
f7d6cec896 Fix email command 2022-08-12 20:19:33 +01:00
b871fea039 Add endpoint to get the bot's nickname 2022-08-09 07:20:30 +01:00
e69ce5090a Properly implement querying with API 2022-08-09 07:20:30 +01:00
813c9baf30 Get our hostname from WHO when we create fake events 2022-08-09 07:20:30 +01:00
220ce976f2 Fire a fake event when we send a message 2022-08-09 07:20:30 +01:00
719f014265 Implement best effort allocation 2022-08-11 21:44:19 +01:00
1ef600a9df Simplify variable names and reformat 2022-08-11 20:51:41 +01:00
b72a0672a5 Use ceil instead of round for relay number rounding 2022-08-11 20:46:44 +01:00
bb3b96e7f7 Expand ECA secondary allocation algorithm 2022-08-11 20:43:34 +01:00
c4db8ec99d Adding more debug statements in ECA system 2022-08-11 20:36:24 +01:00
73b0518a8f Print information about received LIST 2022-08-11 20:32:49 +01:00
571a527f43 Return correct data type for provisioning relays 2022-08-11 20:29:01 +01:00
4c3bab6d96 Simplify is_first_relay 2022-08-11 20:26:19 +01:00
14eb05722c Add even more debugging 2022-08-11 20:21:39 +01:00
11c226833d Add more LIST handling debugging 2022-08-11 20:18:49 +01:00
ea81fc80e3 Don't add 1 to current relays when iterating 2022-08-11 20:13:30 +01:00
8cd22888b7 Add extra debug call for allRelaysActive 2022-08-11 20:12:38 +01:00
ba4b8c7501 Reformat helpers 2022-08-11 20:09:14 +01:00
0666c4a153 Enable debug mode with env vars 2022-08-11 20:09:01 +01:00
2a5e6766be Update IRC template 2022-08-11 19:49:58 +01:00
c983a8e3b6 Allow gaps in relay numbering 2022-08-11 19:22:09 +01:00
a3fe92bea9 Implement deleting networks 2022-08-02 09:01:34 +01:00
9b03485b69 More error handling when joining channels with ChanKeep 2022-08-02 09:01:24 +01:00
98dcb99f90 Implement adding networks 2022-08-01 23:02:20 +01:00
aa68bfd9be Implement requesting channel list for network 2022-08-01 21:38:46 +01:00
f3f717e693 Remove debugging code 2022-08-01 21:31:48 +01:00
864f0904f5 Implement automatic provisioning 2022-08-01 19:34:35 +01:00
b72d3d67a1 Implement updating aliases 2022-08-01 19:05:12 +01:00
96d189290b Implement API endpoint to add next relay 2022-07-29 22:39:08 +01:00
c950bcbd43 Implement deleting relays and fix adding 2022-07-29 22:11:43 +01:00
4472352785 Reformat code 2022-07-29 17:28:19 +01:00
75f79cf072 Fix joining channels with inactive relays 2022-07-29 17:28:09 +01:00
1ca6d79868 Implement creating relays via the API 2022-07-29 17:27:40 +01:00
33466b90ba Fix Redis config path 2022-07-29 22:22:22 +01:00
659d5b391b Use proper port for SSL listener 2022-07-29 22:22:22 +01:00
6e1dfecc95 Disable RelayAPI by default in stack file 2022-07-29 22:22:22 +01:00
3354a94024 Add stack example to test production 2022-07-29 09:09:08 +01:00
a5b25b2048 Use Git dir to make redis config absolute path 2022-07-29 09:06:13 +01:00
1f51bf2972 Use paths relative to root in production compose 2022-07-29 09:04:18 +01:00
6e41c8dfc0 Switch paths 2022-07-29 09:00:08 +01:00
ce0b26577f Use relative paths 2022-07-29 08:59:02 +01:00
335e602072 Fix redis.conf location in prod compose 2022-07-29 08:48:30 +01:00
1fcc9d6643 Don't pass template directory 2022-07-29 08:35:56 +01:00
1ab9824e95 Fix path issue 2022-07-29 08:32:39 +01:00
47312b04d4 Pass through configuration directories to compose 2022-07-29 08:31:01 +01:00
743c1d6be8 Fix environment variable path on production compose 2022-07-29 08:11:37 +01:00
1b60ec62f6 Properly configure production compose file 2022-07-29 08:02:10 +01:00
94303b1108 Create separate production configuration 2022-07-29 08:01:48 +01:00
219fc8ac35 Remove print statements 2022-07-28 21:30:23 +01:00
c5604c0ca8 Add trailing slash to example directory 2022-07-28 21:29:08 +01:00
f9482cac63 Add Portainer Git directory to env file 2022-07-28 21:27:26 +01:00
a61ba7b9e1 Seamlessly handle nonexistent configurations 2022-07-28 21:11:01 +01:00
b3dce50ce4 Add stack.env file 2022-07-28 19:57:26 +01:00
7eee2ec929 Move env file to example 2022-07-28 19:50:48 +01:00
2ad61e6afa Properly pass environment variables to the process 2022-07-28 19:50:07 +01:00
a598bbab4b Make some addresses and hosts configurable with environment variables 2022-07-28 19:38:37 +01:00
422d3d4cdc Lower compose version 2022-07-28 19:25:15 +01:00
2b4e037b51 Add docker definitions 2022-07-28 19:21:08 +01:00
15583bdaab Implement relay, channel and alias management 2022-07-27 22:03:42 +01:00
8050484b6f Implement editing networks via the API 2022-07-27 08:59:17 +01:00
4f141b976a Implement network and channels view 2022-07-26 22:16:35 +01:00
c302cd25da Implement API endpoint for network listing 2022-07-25 18:05:53 +01:00
24a2f79e8e Don't send to Logstash if it's disabled 2022-07-21 13:40:40 +01:00
8c9ec3ab9c Implement getting number of channels and users 2022-07-21 13:40:18 +01:00
a8d0a7d886 Implement more API functions 2022-07-21 13:40:17 +01:00
e3e150c805 Update config 2022-07-21 13:40:15 +01:00
071d6f4579 Implement API 2022-07-21 13:40:13 +01:00
4a8605626a Begin work on API endpoint 2022-07-21 13:40:11 +01:00
80c016761f Reformat again 2022-07-21 13:40:09 +01:00
7a0e2be66c Remove some legacy code 2022-07-21 13:40:07 +01:00
2fecd98978 Reformat project 2022-07-21 13:40:05 +01:00
4ecb37b179 Reformat and fix circular import 2022-07-21 13:40:03 +01:00
27cafa1def Revert "Reformat project"
This reverts commit 64e3e1160aa76d191740342ab3edc68807f890fb.
2022-07-21 13:40:01 +01:00
da678617d8 Reformat project 2022-07-21 13:39:59 +01:00
4669096fcb Don't attempt secondary registration if it is disabled 2022-07-21 13:39:57 +01:00
404fdb000f Don't attempt to register if it is disabled 2022-07-21 13:39:56 +01:00
2177766d90 Rename time to ts 2022-07-21 13:39:54 +01:00
4734a271a1 Extra error handling around emails 2022-07-21 13:39:52 +01:00
ef3151f34c Make Redis DBs configurable 2022-07-21 13:39:50 +01:00
8442c799be Add Redis DB numbers to configuration 2022-07-21 13:39:48 +01:00
e0f86ec853 Fix provisioning with emails 2022-07-21 13:39:46 +01:00
f88e6dec5a Fix some issues with the default config 2022-07-21 13:39:44 +01:00
4ff111a216 Improve email command 2022-07-21 13:39:43 +01:00
7c855e09c0 Reformat code with pre-commit 2022-07-21 13:39:41 +01:00
Mark Veidemanis 61f6715b20 Start implementing email command 2021-08-25 07:47:54 +00:00
Mark Veidemanis 0854c6d60d Add Logstash file 2021-08-24 20:08:18 +00:00
Mark Veidemanis 5179c43972 Implement modifying emails for aliases 2021-06-06 10:31:13 +00:00
Mark Veidemanis 7439d97c71 Finish Logstash implementation 2021-06-06 10:16:04 +00:00
Mark Veidemanis 391f917b38 Update requirements without versions 2021-06-06 10:13:43 +00:00
2686e4ab04 Merge branch 'master' into datarestructure 2020-11-02 20:18:36 +00:00
08b5dc06f0 Implement relay-independent join 2020-11-02 20:14:02 +00:00
5deb0649fb Don't discard server messages 2020-11-02 20:13:36 +00:00
9959231d50 Use substitutions in registration tests 2020-11-01 22:19:03 +00:00
73e596dac3 Additional error handling for command parsing 2020-11-01 22:18:48 +00:00
be405160e4 Fix bug with reg command 2020-11-01 20:43:51 +00:00
7489512a82 Add example file for blacklist 2020-11-01 19:55:32 +00:00
1f178a20ed Implement channel blacklisting 2020-11-01 19:54:24 +00:00
cb21ad8fca Fix bug with using muser attribute when absent 2020-11-01 19:03:56 +00:00
c10274ccd6 Fix syntax error in reg command 2020-11-01 18:50:17 +00:00
9fd6688892 Implement setting modes in ZNC 2020-11-01 03:39:32 +00:00
f54a448d54 Prepare command loader for reloading commands 2020-11-01 03:38:47 +00:00
fe52561b71 Implement registration at net-level 2020-11-01 03:37:29 +00:00
09405f374e Clarify message output on confirm command 2020-11-01 03:36:23 +00:00
16ab37cc0c Log error when ZNC says a channel can't be joined 2020-10-31 23:58:51 +00:00
fc3a349cb3 Fix registration cancellation bug in regproc 2020-10-31 23:58:03 +00:00
fe86d30155 Fix various bugs and off by one with provisioning 2020-10-31 23:55:11 +00:00
7485bbefd1 Move WHO and NAMES logging to trace 2020-10-31 16:52:00 +00:00
82a98c9539 Don't deduplicate global messages (NICK/QUIT) 2020-10-31 16:51:24 +00:00
45f02c323b Improve authentication detection
Add a negative check in the event we are authenticated and registered,
but not confirmed, as this fools other checks.
2020-10-31 16:49:37 +00:00
bdb3d059e3 Use zero-padded numbers to maximise usable ports 2020-10-31 00:13:59 +00:00
e403852778 Error checking in testing for registration message 2020-10-31 00:13:09 +00:00
f3dd102096 Deauth bot when disconnected and lowercase user 2020-10-31 00:12:06 +00:00
1fec14d759 Clarify error message to be more helpful 2020-10-31 00:11:28 +00:00
b67eee42c1 Implement another level of logging for tracing 2020-10-31 00:10:33 +00:00
9e6dd5e03d Note that arguments to list are optional 2020-10-31 00:06:35 +00:00
77e8ef4c16 Implement authentication checking on connection 2020-10-28 22:50:12 +00:00
c879caa9d7 Add checks in dedup for time-less messages 2020-10-28 22:46:22 +00:00
db7e5677d3 Fix decoding issue with some Redis keys 2020-10-28 22:30:49 +00:00
f848b5afd6 Provision users with lowercase names 2020-10-28 22:30:04 +00:00
3bc65f8456 Add the time field to some notifications 2020-10-28 22:26:41 +00:00
95ee63e399 Fix circular import in ChanKeep/provisioning modules 2020-10-28 18:38:27 +00:00
a1e045793c Start implementing prefixes 2020-07-09 19:43:47 +01:00
f50a40d207 Fixes to auth detection and message parsing
* don't check authentication if the network doesn't need to
  register
* don't pass through muser for ZNC type messages
* avoid duplicate message for queries containing highlights
* make a copy of the cast for metadata analysis to avoid poisoning it
* set up callback for when the instance is authenticated, so we can
  request a LIST immediately if so desired
* separate out seeding functions to populate CHANLIMIT to ease future
  work involving other options, such as PREFIX
2020-06-07 17:26:53 +01:00
4c08225a50 Remove condition-based monitoring system 2020-06-07 15:31:43 +01:00
11f15ac960 Fix various bugs in the event system
Squash many bugs in the event notification system and simplify the
code.
2020-06-02 21:34:15 +01:00
8103c16253 Fix syntax error in redis query 2020-05-31 21:54:43 +01:00
45070b06e2 Implement authentication detection
* pending command to see which instances have never authenticated
* authcheck command to see which instances are not currently
  authenticated
2020-05-31 21:52:56 +01:00
12db2f349e Add help for pending command 2020-05-31 16:40:51 +01:00
40e1f38508 Add additional error handling in user queries 2020-05-31 13:44:34 +01:00
63c97db12e Function to select and merge IRC network defs 2020-05-31 13:23:09 +01:00
91885170f1 Check registration status before joining channels
Do not join channels if any relay for a network is unregistered.
2020-05-31 13:09:58 +01:00
7c23766763 Allow sending LIST to all networks at once 2020-05-31 13:08:00 +01:00
9e62ac62bc Add confirm command
Confirm command to check which relays need manual
confirmation.
2020-05-31 12:32:12 +01:00
014de9f958 Remove leftover irc.json file 2020-05-30 21:42:26 +01:00
f90f2fdef7 Implement registration and confirmation of nicks 2020-05-30 21:40:10 +01:00
e0549cdd30 Restructure provisioning into fewer functions 2020-05-30 21:37:22 +01:00
a78229a288 Add irc.json to gitignore 2020-05-30 21:35:50 +01:00
918d410927 Fix variable scope in LIST error handling 2020-04-21 23:32:17 +01:00
bc4d5cba8e Separate provisioning into user and auth info 2019-12-28 17:51:03 +00:00
376d1bd911 Add IRC network definitions 2019-12-28 17:50:38 +00:00
778690ae3a Add more comments and remove obsolete code 2019-12-07 16:35:29 +00:00
da3ba4ea8c Add requirements 2019-11-17 19:09:17 +00:00
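The commit bodies for 45f02c323b and f50a40d207 above describe two recurring ideas in the auth-detection work: treat "authenticated and registered but not confirmed" as its own negative state so it cannot fool later checks, and copy the parsed message ("cast") before metadata analysis so the original event is not mutated. A minimal sketch of both ideas; the names (auth_state, analyse_metadata, the dict fields) are illustrative and not taken from the repository:

from copy import deepcopy


def auth_state(authenticated: bool, registered: bool, confirmed: bool) -> str:
    # Negative check: registered but unconfirmed must not look "ready",
    # otherwise downstream checks are fooled into skipping confirmation.
    if authenticated and registered and not confirmed:
        return "needs_confirmation"
    if authenticated and registered and confirmed:
        return "ready"
    return "unauthenticated"


def analyse_metadata(cast: dict) -> dict:
    # Work on a copy so analysis cannot poison the original event,
    # which is still needed unmodified for storage and notifications.
    work = deepcopy(cast)
    work["tokens"] = work.get("msg", "").split()
    return work

The real logic lives in the regproc and chankeep modules touched by the surrounding commits; the sketch only captures the control flow described in the commit messages.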
46 changed files with 2391 additions and 1200 deletions

.gitignore (vendored): 1 line changed

@@ -159,4 +159,3 @@ docker/data
*.pem
legacy/conf/live/
legacy/conf/cert/
stack.env


@@ -1,21 +1,15 @@
repos:
- repo: https://github.com/psf/black
rev: 23.1.0
rev: 22.6.0
hooks:
- id: black
exclude: ^core/migrations
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
rev: 5.10.1
hooks:
- id: isort
args: ["--profile", "black"]
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
rev: 4.0.1
hooks:
- id: flake8
args: [--max-line-length=88]
exclude: ^core/migrations
- repo: https://github.com/sirwart/ripsecrets.git
rev: v0.1.5
hooks:
- id: ripsecrets


@@ -1,20 +0,0 @@
run:
docker-compose -f docker-compose.prod.yml --env-file=stack.env up -d
build:
docker-compose -f docker-compose.prod.yml --env-file=stack.env build
stop:
docker-compose -f docker-compose.prod.yml --env-file=stack.env down
log:
docker-compose -f docker-compose.prod.yml --env-file=stack.env logs -f --names
run-infra:
docker-compose -f docker-compose.infra.yml --env-file=stack.env up -d
stop-infra:
docker-compose -f docker-compose.infra.yml --env-file=stack.env down
log-infra:
docker-compose -f docker-compose.infra.yml --env-file=stack.env logs -f

db.py: 253 lines changed

@@ -1,52 +1,23 @@
import asyncio
from math import ceil
from os import getenv
from time import sleep
import random
import aiomysql
import aioredis
import manticoresearch
import msgpack
import orjson
from manticoresearch.rest import ApiException
from numpy import array_split
# Kafka
from aiokafka import AIOKafkaProducer
from redis import StrictRedis
import util
from schemas import mc_s
mysql_pool = None
configuration = manticoresearch.Configuration(host="http://127.0.0.1:9308")
api_client = manticoresearch.ApiClient(configuration)
api_instance = manticoresearch.IndexApi(api_client)
# KAFKA_TOPIC = "msg"
log = util.get_logger("db")
# Redis (legacy)
# r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0)
r = StrictRedis(
host="127.0.0.1", # Replace with your Redis server's IP address
port=1289, # Replace with your Redis server's port
db=0, # Database number
)
r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0)
# AIORedis
# ar = aioredis.from_url("unix:///var/run/redis/redis.sock", db=0)
ar = aioredis.from_url("redis://127.0.0.1:1289", db=0)
# /var/run/neptune-redis.sock
# db = 10
pr = aioredis.from_url("unix://var/run/neptune-redis.sock", db=10)
# fr = aioredis.from_url("unix://var/run/fisk-redis.sock", db=10)
fr = aioredis.from_url("unix://var/run/redis.sock", db=10)
# pr = aioredis.from_url("redis://redis_neptune:6379", db=10, password=getenv("REDIS_PASSWORD"))
KEYNAME = "queue"
MESSAGE_KEY = "messages"
OHLC_MESSAGE_KEY = "ohlc"
ar = aioredis.from_url("unix:///var/run/redis/redis.sock", db=0)
TYPES_MAIN = [
"msg",
@@ -61,174 +32,89 @@ TYPES_MAIN = [
"topic",
"update",
]
MAIN_SRC_MAP = {
"dis": "main",
"irc": "restricted",
"4ch": "main",
}
TYPES_META = ["who"]
TYPES_INT = ["conn", "highlight", "znc", "query", "self"]
KEYNAME = "queue"
async def init_mysql_pool():
"""
Initialize the MySQL connection pool.
"""
global mysql_pool
mysql_pool = await aiomysql.create_pool(
host="127.0.0.1", port=9306, db="Manticore", minsize=1, maxsize=10
)
async def rts_store_message(index, data):
"""
Store a RTS message into MySQL using an existing connection pool.
Prioritizes instant PubSub delivery, with minimal data storage overhead.
:param index: str
:param data: dict
"""
# Publish to Redis PubSub
packed_index = msgpack.packb({"index": index, "data": data}, use_bin_type=True)
try:
await fr.publish(OHLC_MESSAGE_KEY, packed_index)
except aioredis.exceptions.ConnectionError as e:
raise e
await asyncio.sleep(0.1)
# Insert data into MySQL
try:
async with mysql_pool.acquire() as conn:
async with conn.cursor() as cur:
# Insert data into the table
query = f"""
INSERT INTO {index} (s, o, c, h, l, v, a, i, t, t2, ts)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
# Bind the values directly
await cur.execute(
query,
(
data["s"], # symbol
data["o"], # open
data["c"], # close
data["h"], # high
data["l"], # low
data["v"], # volume_base
data["a"], # volume_quote
data["i"], # interval
data["t"], # start_time
data["t2"], # end_time
data["ts"], # event_time
),
)
await conn.commit()
log.debug(f"Stored data for {data['s']} in MySQL.")
except aiomysql.Error as e:
log.error(f"MySQL error: {e}")
async def store_batch(data):
"""
Store a message into Manticore
:param data: list
"""
if not data:
return
# 10000: maximum inserts we can submit to
# Manticore as of Sept 2022
split_posts = array_split(data, ceil(len(data) / 10000))
for messages in split_posts:
total = []
indexmap = {}
for msg in messages:
async def store_kafka_batch(data):
# log.debug(f"Storing Kafka batch of {len(data)} messages")
producer = AIOKafkaProducer(bootstrap_servers="kafka:9092")
await producer.start()
topicmap = {}
for msg in data:
if msg["type"] in TYPES_MAIN:
index = "main"
schema = mc_s.schema_main
# index = "main"
index = MAIN_SRC_MAP[msg["src"]]
# schema = mc_s.schema_main
elif msg["type"] in TYPES_META:
index = "meta"
schema = mc_s.schema_meta
# schema = mc_s.schema_meta
elif msg["type"] in TYPES_INT:
index = "internal"
schema = mc_s.schema_int
# normalise fields
for key, value in list(msg.items()):
if value is None:
del msg[key]
if key in schema:
if isinstance(value, int):
if schema[key].startswith("string") or schema[key].startswith(
"text"
):
msg[key] = str(value)
# schema = mc_s.schema_int
body = {"insert": {"index": index, "doc": msg}}
total.append(body)
KAFKA_TOPIC = index
# if key in schema:
# if isinstance(value, int):
# if schema[key].startswith("string") or schema[key].startswith(
# "text"
# ):
# msg[key] = str(value)
body = orjson.dumps(msg)
if "ts" not in msg:
raise Exception("No TS in msg")
if index not in indexmap:
indexmap[index] = [msg]
if KAFKA_TOPIC not in topicmap:
topicmap[KAFKA_TOPIC] = [body]
else:
indexmap[index].append(msg)
# END MSG IN MESSAGES
topicmap[KAFKA_TOPIC].append(body)
# Pack the indexmap with msgpack and publish it to Neptune
packed_index = msgpack.packb(indexmap, use_bin_type=True)
completed_publish = False
for i in range(10):
if completed_publish:
break
try:
await pr.publish(MESSAGE_KEY, packed_index)
completed_publish = True
except aioredis.exceptions.ConnectionError as e:
raise e
await asyncio.sleep(0.1)
if not completed_publish:
log.error("Failed to publish to Neptune")
for topic, messages in topicmap.items():
batch = producer.create_batch()
for body in messages:
metadata = batch.append(key=None, value=body, timestamp=msg["ts"])
if metadata is None:
partitions = await producer.partitions_for(topic)
partition = random.choice(tuple(partitions))
await producer.send_batch(batch, topic, partition=partition)
# log.debug(
# (
# f"{batch.record_count()} messages sent to topic "
# f"{topic} partition {partition}"
# )
# )
batch = producer.create_batch()
continue
body_post = ""
for item in total:
# print("ITEM", item)
body_post += orjson.dumps(item).decode("utf-8")
body_post += "\n"
# print("BODY POST INDEX", index, body_post)
try:
# Bulk index operations
api_response = api_instance.bulk(body_post) # , async_req=True
except ApiException as e:
log.error("Exception when calling IndexApi->bulk: %s\n" % e)
log.error("body_post attempted to send", body_post)
log.info(f"Completed ingest to MC of length {len(total)}")
# END MESSAGES IN SPLIT
def update_schema():
pass
def create_index(api_client):
util_instance = manticoresearch.UtilsApi(api_client)
schemas = {
"main": mc_s.schema_main,
"rule_storage": mc_s.schema_rule_storage,
"meta": mc_s.schema_meta,
"internal": mc_s.schema_int,
}
for name, schema in schemas.items():
schema_types = ", ".join([f"{k} {v}" for k, v in schema.items()])
create_query = (
f"create table if not exists {name}({schema_types}) engine='columnar'"
partitions = await producer.partitions_for(topic)
partition = random.choice(tuple(partitions))
await producer.send_batch(batch, topic, partition=partition)
# log.debug(
# (
# f"{batch.record_count()} messages sent to topic "
# f"{topic} partition {partition}"
# )
# )
log.debug(
"Kafka batches sent: "
+ ", ".join([topic + ": " + str(len(topicmap[topic])) for topic in topicmap])
)
print("Schema types", create_query)
util_instance.sql(create_query)
await producer.stop()
async def queue_message(msg):
"""
Queue a message on the Redis buffer.
"""
# TODO: msgpack
message = orjson.dumps(msg)
await ar.lpush(KEYNAME, message)
await ar.sadd(KEYNAME, message)
async def queue_message_bulk(data):
@@ -236,6 +122,5 @@ async def queue_message_bulk(data):
Queue multiple messages on the Redis buffer.
"""
for msg in data:
# TODO: msgpack
message = orjson.dumps(msg)
await ar.lpush(KEYNAME, message)
await ar.sadd(KEYNAME, message)
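The reworked db.py above buffers messages in a Redis set (SADD on the "queue" key, replacing LPUSH) and ships them to Kafka with aiokafka's batch API: append records to a batch until append() reports it is full by returning None, send the batch to a randomly chosen partition, then continue with a fresh batch. A condensed, self-contained sketch of that pattern, assuming a broker reachable at kafka:9092 and messages carrying a "ts" field; the topic and function names here are illustrative:

import random

import orjson
from aiokafka import AIOKafkaProducer


async def send_batches(messages, topic="main"):
    producer = AIOKafkaProducer(bootstrap_servers="kafka:9092")
    await producer.start()
    try:
        batch = producer.create_batch()
        for msg in messages:
            body = orjson.dumps(msg)
            # append() returns metadata on success and None when the batch is full
            if batch.append(key=None, value=body, timestamp=msg["ts"]) is None:
                partitions = await producer.partitions_for(topic)
                await producer.send_batch(
                    batch, topic, partition=random.choice(tuple(partitions))
                )
                batch = producer.create_batch()
                batch.append(key=None, value=body, timestamp=msg["ts"])
        # flush whatever is left in the final batch
        if batch.record_count():
            partitions = await producer.partitions_for(topic)
            await producer.send_batch(
                batch, topic, partition=random.choice(tuple(partitions))
            )
    finally:
        await producer.stop()

On the consuming side, SADD pairs with SPOP and a count to pull fixed-size chunks off the buffer (commit 63081f68b7 above is about making that chunk size precise), e.g. await ar.spop("queue", chunk_size), assuming the aioredis client in use accepts a count argument to spop.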


@@ -1,174 +0,0 @@
import asyncio
from os import getenv
import aioredis
import msgpack
import orjson
import redis
# Elasticsearch
from elasticsearch import AsyncElasticsearch
import util
trues = ("true", "1", "t", True)
# INDEX = "msg"
log = util.get_logger("db")
# Redis (legacy)
# r = redis.from_url("redis://ssdb:1289", db=0)
# AIORedis
ar = aioredis.from_url("redis://ssdb:1289", db=0)
# Neptune redis for PubSub
pr = aioredis.from_url("redis://redis_neptune:6379", db=10)
TYPES_MAIN = [
"msg",
"notice",
"action",
"part",
"join",
"kick",
"quit",
"nick",
"mode",
"topic",
"update",
]
MAIN_SRC_MAP = {
"dis": "main",
"irc": "restricted",
"4ch": "main",
}
TYPES_META = ["who"]
TYPES_INT = ["conn", "highlight", "znc", "query", "self"]
KEYNAME = "queue"
MESSAGE_KEY = "messages"
ELASTICSEARCH_USERNAME = getenv("ELASTICSEARCH_USERNAME", "elastic")
ELASTICSEARCH_PASSWORD = getenv("ELASTICSEARCH_PASSWORD", "changeme")
ELASTICSEARCH_HOST = getenv("ELASTICSEARCH_HOST", "localhost")
ELASTICSEARCH_TLS = getenv("ELASTICSEARCH_TLS", "false") in trues
client = None
# These are sometimes numeric, sometimes strings.
# If they are seen to be numeric first, ES will erroneously
# index them as "long" and then subsequently fail to index messages
# with strings in the field.
keyword_fields = ["nick_id", "user_id", "net_id"]
mapping_int = {
"mappings": {
"properties": {
"ts": {"type": "date", "format": "epoch_second"},
"file_tim": {"type": "date", "format": "epoch_millis"},
}
}
}
mapping = dict(mapping_int)
for field in keyword_fields:
mapping["mappings"]["properties"][field] = {"type": "text"}
del mapping_int["mappings"]["properties"]["file_tim"]
async def initialise_elasticsearch():
"""
Initialise the Elasticsearch client.
"""
auth = (ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD)
client = AsyncElasticsearch(ELASTICSEARCH_HOST, http_auth=auth, verify_certs=False)
for index in ("main", "meta", "restricted", "internal"):
if index == "internal":
map_dict = mapping_int
else:
map_dict = mapping
if await client.indices.exists(index=index):
# update index with mapping
await client.indices.put_mapping(
index=index, properties=map_dict["mappings"]["properties"]
)
else:
await client.indices.create(index=index, mappings=map_dict["mappings"])
return client
async def store_batch(data):
global client
if not client:
client = await initialise_elasticsearch()
indexmap = {}
for msg in data:
if msg["type"] in TYPES_MAIN:
# index = "main"
index = MAIN_SRC_MAP[msg["src"]]
# schema = mc_s.schema_main
elif msg["type"] in TYPES_META:
index = "meta"
# schema = mc_s.schema_meta
elif msg["type"] in TYPES_INT:
index = "internal"
# schema = mc_s.schema_int
INDEX = index
# if key in schema:
# if isinstance(value, int):
# if schema[key].startswith("string") or schema[key].startswith(
# "text"
# ):
# msg[key] = str(value)
# body = orjson.dumps(msg)
if "ts" not in msg:
raise Exception("No TS in msg")
if INDEX not in indexmap:
indexmap[INDEX] = [msg]
else:
indexmap[INDEX].append(msg)
# Pack the indexmap with msgpack and publish it to Neptune
packed_index = msgpack.packb(indexmap, use_bin_type=True)
completed_publish = False
for i in range(10):
if completed_publish:
break
try:
await pr.publish(MESSAGE_KEY, packed_index)
completed_publish = True
except aioredis.exceptions.ConnectionError:
await asyncio.sleep(0.1)
if not completed_publish:
log.error("Failed to publish to Neptune")
for index, index_messages in indexmap.items():
for message in index_messages:
result = await client.index(index=index, body=message)
if not result["result"] == "created":
log.error(f"Indexing failed: {result}")
log.debug(f"Indexed {len(data)} messages in ES")
async def queue_message(msg):
"""
Queue a message on the Redis buffer.
"""
# TODO: msgpack
message = orjson.dumps(msg)
await ar.lpush(KEYNAME, message)
async def queue_message_bulk(data):
"""
Queue multiple messages on the Redis buffer.
"""
for msg in data:
# TODO: msgpack
message = orjson.dumps(msg)
await ar.lpush(KEYNAME, message)


@@ -1,206 +0,0 @@
version: "2.2"
services:
rts:
image: xf/monolith:latest
container_name: rts_monolith
command: sh -c '. /venv/bin/activate && exec python rts.py'
build: .
volumes:
- ${PORTAINER_GIT_DIR}:/code
- type: bind
source: /code/run
target: /var/run
environment:
PORTAINER_GIT_DIR: "${PORTAINER_GIT_DIR}"
MODULES_ENABLED: "${MODULES_ENABLED}"
MONOLITH_RTS_MEXC_API_ACCESS_KEY: "${MONOLITH_RTS_MEXC_API_ACCESS_KEY}"
MONOLITH_RTS_MEXC_API_SECRET_KEY: "${MONOLITH_RTS_MEXC_API_SECRET_KEY}"
deploy:
resources:
limits:
cpus: '0.5'
memory: 1.0G
network_mode: host
app:
image: xf/monolith:latest
container_name: monolith
#command: sh -c '. /venv/bin/activate && exec python -m cProfile -o /tmp/profile.out monolith.py'
build: .
volumes:
- ${PORTAINER_GIT_DIR}:/code
- type: bind
source: /code/run
target: /var/run
environment:
PORTAINER_GIT_DIR: "${PORTAINER_GIT_DIR}"
MODULES_ENABLED: "${MODULES_ENABLED}"
DISCORD_TOKEN: "${DISCORD_TOKEN}"
THRESHOLD_LISTENER_HOST: "${THRESHOLD_LISTENER_HOST}"
THRESHOLD_LISTENER_PORT: "${THRESHOLD_LISTENER_PORT}"
THRESHOLD_LISTENER_SSL: "${THRESHOLD_LISTENER_SSL}"
THRESHOLD_RELAY_ENABLED: "${THRESHOLD_RELAY_ENABLED}"
THRESHOLD_RELAY_HOST: "${THRESHOLD_RELAY_HOST}"
THRESHOLD_RELAY_PORT: "${THRESHOLD_RELAY_PORT}"
THRESHOLD_RELAY_SSL: "${THRESHOLD_RELAY_SSL}"
THRESHOLD_API_ENABLED: "${THRESHOLD_API_ENABLED}"
THRESHOLD_API_HOST: "${THRESHOLD_API_HOST}"
THRESHOLD_API_PORT: "${THRESHOLD_API_PORT}"
THRESHOLD_CONFIG_DIR: "${THRESHOLD_CONFIG_DIR}"
#THRESHOLD_TEMPLATE_DIR: "${#THRESHOLD_TEMPLATE_DIR}"
THRESHOLD_CERT_DIR: "${THRESHOLD_CERT_DIR}"
# How many messages to ingest at once from Redis
MONOLITH_INGEST_CHUNK_SIZE: "${MONOLITH_INGEST_CHUNK_SIZE}"
# Time to wait between polling Redis again
MONOLITH_INGEST_ITER_DELAY: "${MONOLITH_INGEST_ITER_DELAY}"
# Number of 4chan threads to request at once
MONOLITH_CH4_THREADS_CONCURRENT: "${MONOLITH_CH4_THREADS_CONCURRENT}"
# Time to wait between every MONOLITH_CH4_THREADS_CONCURRENT threads
MONOLITH_CH4_THREADS_DELAY: "${MONOLITH_CH4_THREADS_DELAY}"
# Time to wait after finishing a crawl before starting again
MONOLITH_CH4_CRAWL_DELAY: "${MONOLITH_CH4_CRAWL_DELAY}"
# Semaphore value
MONOLITH_CH4_THREADS_SEMAPHORE: "${MONOLITH_CH4_THREADS_SEMAPHORE}"
# Threads to use for data processing
# Leave uncommented to use all available threads
MONOLITH_PROCESS_THREADS: "${MONOLITH_PROCESS_THREADS}"
# Enable performance metrics after message processing
MONOLITH_PROCESS_PERFSTATS: "${MONOLITH_PROCESS_PERFSTATS}"
MONOLITH_PROCESS_TARGET_CPU_USAGE: "${MONOLITH_PROCESS_TARGET_CPU_USAGE}"
MONOLITH_CH4_TARGET_CPU_USAGE: "${MONOLITH_CH4_TARGET_CPU_USAGE}"
MONOLITH_CH4_BOARDS: "${MONOLITH_CH4_BOARDS}"
REDIS_PASSWORD: "${REDIS_PASSWORD}"
MONOLITH_INGEST_INCREASE_BELOW: "${MONOLITH_INGEST_INCREASE_BELOW}"
MONOLITH_INGEST_INCREASE_BY: "${MONOLITH_INGEST_INCREASE_BY}"
MONOLITH_INGEST_DECREASE_ABOVE: "${MONOLITH_INGEST_DECREASE_ABOVE}"
MONOLITH_INGEST_DECREASE_BY: "${MONOLITH_INGEST_DECREASE_BY}"
MONOLITH_INGEST_MAX: "${MONOLITH_INGEST_MAX}"
MONOLITH_INGEST_MIN: "${MONOLITH_INGEST_MIN}"
deploy:
resources:
limits:
cpus: '0.5'
memory: 1.0G
network_mode: host
threshold:
image: xf/threshold:latest
container_name: threshold
build: legacy/docker
volumes:
- ${PORTAINER_GIT_DIR}:/code
- ${THRESHOLD_CONFIG_DIR}:/code/legacy/conf/live
#- ${THRESHOLD_TEMPLATE_DIR}:/code/conf/templates
- ${THRESHOLD_CERT_DIR}:/code/legacy/conf/cert
volumes_from:
- tmp
ports:
- "${THRESHOLD_LISTENER_PORT}:${THRESHOLD_LISTENER_PORT}"
- "${THRESHOLD_RELAY_PORT}:${THRESHOLD_RELAY_PORT}"
- "${THRESHOLD_API_PORT}:${THRESHOLD_API_PORT}"
environment:
PORTAINER_GIT_DIR: "${PORTAINER_GIT_DIR}"
MODULES_ENABLED: "${MODULES_ENABLED}"
DISCORD_TOKEN: "${DISCORD_TOKEN}"
THRESHOLD_LISTENER_HOST: "${THRESHOLD_LISTENER_HOST}"
THRESHOLD_LISTENER_PORT: "${THRESHOLD_LISTENER_PORT}"
THRESHOLD_LISTENER_SSL: "${THRESHOLD_LISTENER_SSL}"
THRESHOLD_RELAY_ENABLED: "${THRESHOLD_RELAY_ENABLED}"
THRESHOLD_RELAY_HOST: "${THRESHOLD_RELAY_HOST}"
THRESHOLD_RELAY_PORT: "${THRESHOLD_RELAY_PORT}"
THRESHOLD_RELAY_SSL: "${THRESHOLD_RELAY_SSL}"
THRESHOLD_API_ENABLED: "${THRESHOLD_API_ENABLED}"
THRESHOLD_API_HOST: "${THRESHOLD_API_HOST}"
THRESHOLD_API_PORT: "${THRESHOLD_API_PORT}"
THRESHOLD_CONFIG_DIR: "${THRESHOLD_CONFIG_DIR}"
#THRESHOLD_TEMPLATE_DIR: "${#THRESHOLD_TEMPLATE_DIR}"
THRESHOLD_CERT_DIR: "${THRESHOLD_CERT_DIR}"
# How many messages to ingest at once from Redis
MONOLITH_INGEST_CHUNK_SIZE: "${MONOLITH_INGEST_CHUNK_SIZE}"
# Time to wait between polling Redis again
MONOLITH_INGEST_ITER_DELAY: "${MONOLITH_INGEST_ITER_DELAY}"
# Number of 4chan threads to request at once
MONOLITH_CH4_THREADS_CONCURRENT: "${MONOLITH_CH4_THREADS_CONCURRENT}"
# Time to wait between every MONOLITH_CH4_THREADS_CONCURRENT threads
MONOLITH_CH4_THREADS_DELAY: "${MONOLITH_CH4_THREADS_DELAY}"
# Time to wait after finishing a crawl before starting again
MONOLITH_CH4_CRAWL_DELAY: "${MONOLITH_CH4_CRAWL_DELAY}"
# Semaphore value
MONOLITH_CH4_THREADS_SEMAPHORE: "${MONOLITH_CH4_THREADS_SEMAPHORE}"
# Threads to use for data processing
# Leave uncommented to use all available threads
MONOLITH_PROCESS_THREADS: "${MONOLITH_PROCESS_THREADS}"
# Enable performance metrics after message processing
MONOLITH_PROCESS_PERFSTATS: "${MONOLITH_PROCESS_PERFSTATS}"
MONOLITH_CH4_BOARDS: "${MONOLITH_CH4_BOARDS}"
REDIS_PASSWORD: "${REDIS_PASSWORD}"
# for development
extra_hosts:
- "host.docker.internal:host-gateway"
network_mode: host
ssdb:
image: tsl0922/ssdb
container_name: ssdb_monolith
ports:
- "1289:1289"
environment:
- SSDB_PORT=1289
volumes:
- monolith_ssdb_data:/var/lib/ssdb
# networks:
# - default
# - db
deploy:
resources:
limits:
cpus: '0.5'
memory: 1.0G
network_mode: host
redis:
image: redis
container_name: redis_monolith
command: redis-server /etc/redis.conf
ulimits:
nproc: 65535
nofile:
soft: 65535
hard: 65535
volumes:
- ${PORTAINER_GIT_DIR}/docker/redis.conf:/etc/redis.conf
- monolith_redis_data:/data
- type: bind
source: /code/run
target: /var/run
# volumes_from:
# - tmp
healthcheck:
test: "redis-cli ping"
interval: 2s
timeout: 2s
retries: 15
# networks:
# - default
# - xf
# - db
deploy:
resources:
limits:
cpus: '0.5'
memory: 1.0G
network_mode: host
# networks:
# default:
# driver: bridge
# xf:
# external: true
# db:
# external: true
volumes:
monolith_redis_data:
monolith_ssdb_data:

docker-compose.yml (new file): 351 lines added

@@ -0,0 +1,351 @@
version: "2.2"
x-superset-image: &superset-image apache/superset:${TAG:-latest-dev}
x-superset-depends-on: &superset-depends-on
- db
- redis_superset
x-superset-volumes: &superset-volumes
# /app/pythonpath_docker will be appended to the PYTHONPATH in the final container
- ./docker/superset:/app/docker
- superset_home:/app/superset_home
services:
app:
image: pathogen/monolith:latest
container_name: monolith
build: ./docker
volumes:
- ${PORTAINER_GIT_DIR}:/code
env_file:
- .env
volumes_from:
- tmp
depends_on:
broker:
condition: service_started
kafka:
condition: service_healthy
tmp:
condition: service_started
redis:
condition: service_healthy
# - db
threshold:
image: pathogen/threshold:latest
container_name: threshold
build: ./legacy/docker
volumes:
- ${PORTAINER_GIT_DIR}:/code
- ${THRESHOLD_CONFIG_DIR}:/code/legacy/conf/live
#- ${THRESHOLD_TEMPLATE_DIR}:/code/conf/templates
- ${THRESHOLD_CERT_DIR}:/code/legacy/conf/cert
ports:
- "${THRESHOLD_LISTENER_PORT}:${THRESHOLD_LISTENER_PORT}"
- "${THRESHOLD_RELAY_PORT}:${THRESHOLD_RELAY_PORT}"
- "${THRESHOLD_API_PORT}:${THRESHOLD_API_PORT}"
env_file:
- .env
# for development
extra_hosts:
- "host.docker.internal:host-gateway"
volumes_from:
- tmp
depends_on:
tmp:
condition: service_started
redis:
condition: service_healthy
# db:
#image: pathogen/manticore:kibana
# image: manticoresearch/manticore:latest
#build:
# context: ./docker/manticore
# args:
# DEV: 1
# restart: always
# turnilo:
# container_name: turnilo
# image: uchhatre/turnilo:latest
# ports:
# - 9093:9090
# environment:
# - DRUID_BROKER_URL=http://broker:8082
# - CONFIG_FILE=/config.yaml
# volumes:
# - ${PORTAINER_GIT_DIR}/docker/turnilo.yaml:/config.yaml
# depends_on:
# - broker
# metabase:
# container_name: metabase
# image: metabase/metabase:latest
# ports:
# - 3096:3000
# environment:
# JAVA_OPTS: -Xmx1g
# MB_DB_TYPE: postgres
# MB_DB_DBNAME: metabase
# MB_DB_PORT: 5432
# MB_DB_USER: druid
# MB_DB_PASS: FoolishPassword
# MB_DB_HOST: postgres
# depends_on:
# - broker
redis_superset:
image: redis:latest
container_name: superset_cache
restart: unless-stopped
volumes:
- redis:/data
db:
env_file: docker/.env-non-dev
image: postgres:10
container_name: superset_db
restart: unless-stopped
volumes:
- db_home:/var/lib/postgresql/data
superset:
env_file: docker/.env-non-dev
image: *superset-image
container_name: superset_app
command: ["/app/docker/docker-bootstrap.sh", "app-gunicorn"]
user: "root"
restart: unless-stopped
ports:
- 8088:8088
depends_on: *superset-depends-on
volumes: *superset-volumes
superset-init:
image: *superset-image
container_name: superset_init
command: ["/app/docker/docker-init.sh"]
env_file: docker/.env-non-dev
depends_on: *superset-depends-on
user: "root"
volumes: *superset-volumes
superset-worker:
image: *superset-image
container_name: superset_worker
command: ["/app/docker/docker-bootstrap.sh", "worker"]
env_file: docker/.env-non-dev
restart: unless-stopped
depends_on: *superset-depends-on
user: "root"
volumes: *superset-volumes
superset-worker-beat:
image: *superset-image
container_name: superset_worker_beat
command: ["/app/docker/docker-bootstrap.sh", "beat"]
env_file: docker/.env-non-dev
restart: unless-stopped
depends_on: *superset-depends-on
user: "root"
volumes: *superset-volumes
postgres:
container_name: postgres
image: postgres:latest
volumes:
- metadata_data:/var/lib/postgresql/data
environment:
POSTGRES_PASSWORD: FoolishPassword
POSTGRES_USER: druid
POSTGRES_DB: druid
# Need 3.5 or later for container nodes
zookeeper:
container_name: zookeeper
image: zookeeper:3.5
ports:
- "2181:2181"
environment:
- ZOO_MY_ID=1
kafka:
image: wurstmeister/kafka:latest
container_name: kafka
depends_on:
- zookeeper
ports:
- 9092:9092
- 29092:29092
environment:
KAFKA_BROKER_ID: 1
KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,PLAINTEXT_HOST://localhost:29092
KAFKA_LISTENERS: PLAINTEXT://kafka:9092,PLAINTEXT_HOST://localhost:29092
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'true'
KAFKA_MESSAGE_MAX_BYTES: 2000000
#KAFKA_HEAP_OPTS: -Xmx2g
healthcheck:
test: ["CMD", "kafka-topics.sh", "--list", "--bootstrap-server", "kafka:9092"]
start_period: 15s
interval: 30s
timeout: 30s
retries: 45
coordinator:
image: apache/druid:0.23.0
container_name: coordinator
volumes:
- druid_shared:/opt/shared
- coordinator_var:/opt/druid/var
depends_on:
- zookeeper
- postgres
ports:
- "8081:8081"
command:
- coordinator
env_file:
- environment
broker:
image: apache/druid:0.23.0
container_name: broker
volumes:
- broker_var:/opt/druid/var
depends_on:
- zookeeper
- postgres
- coordinator
ports:
- "8082:8082"
command:
- broker
env_file:
- environment
historical:
image: apache/druid:0.23.0
container_name: historical
volumes:
- druid_shared:/opt/shared
- historical_var:/opt/druid/var
depends_on:
- zookeeper
- postgres
- coordinator
ports:
- "8083:8083"
command:
- historical
env_file:
- environment
middlemanager:
image: apache/druid:0.23.0
container_name: middlemanager
volumes:
- druid_shared:/opt/shared
- middle_var:/opt/druid/var
depends_on:
- zookeeper
- postgres
- coordinator
ports:
- "8091:8091"
- "8100-8105:8100-8105"
command:
- middleManager
env_file:
- environment
router:
image: apache/druid:0.23.0
container_name: router
volumes:
- router_var:/opt/druid/var
depends_on:
- zookeeper
- postgres
- coordinator
ports:
- "8888:8888"
command:
- router
env_file:
- environment
# db:
# #image: pathogen/manticore:kibana
# image: manticoresearch/manticore:dev
# #build:
# # context: ./docker/manticore
# # args:
# # DEV: 1
# restart: always
# ports:
# - 9308
# - 9312
# - 9306
# ulimits:
# nproc: 65535
# nofile:
# soft: 65535
# hard: 65535
# memlock:
# soft: -1
# hard: -1
# environment:
# - MCL=1
# volumes:
# - ./docker/data:/var/lib/manticore
# - ./docker/manticore.conf:/etc/manticoresearch/manticore.conf
tmp:
image: busybox
command: chmod -R 777 /var/run/redis
volumes:
- /var/run/redis
redis:
image: redis
command: redis-server /etc/redis.conf
ulimits:
nproc: 65535
nofile:
soft: 65535
hard: 65535
volumes:
- ${PORTAINER_GIT_DIR}/docker/redis.conf:/etc/redis.conf
- redis_data:/data
volumes_from:
- tmp
healthcheck:
test: "redis-cli -s /var/run/redis/redis.sock ping"
interval: 2s
timeout: 2s
retries: 15
networks:
default:
external:
name: pathogen
volumes:
superset_home:
external: false
db_home:
external: false
redis_superset:
external: false
redis_data: {}
metadata_data: {}
middle_var: {}
historical_var: {}
broker_var: {}
coordinator_var: {}
router_var: {}
druid_shared: {}

docker/.env-non-dev (new file): 46 lines added

@@ -0,0 +1,46 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
COMPOSE_PROJECT_NAME=superset
# database configurations (do not modify)
DATABASE_DB=superset
DATABASE_HOST=db
DATABASE_PASSWORD=superset
DATABASE_USER=superset
# database engine specific environment variables
# change the below if you prefers another database engine
DATABASE_PORT=5432
DATABASE_DIALECT=postgresql
POSTGRES_DB=superset
POSTGRES_USER=superset
POSTGRES_PASSWORD=superset
#MYSQL_DATABASE=superset
#MYSQL_USER=superset
#MYSQL_PASSWORD=superset
#MYSQL_RANDOM_ROOT_PASSWORD=yes
# Add the mapped in /app/pythonpath_docker which allows devs to override stuff
PYTHONPATH=/app/pythonpath:/app/docker/pythonpath_dev
REDIS_HOST=redis
REDIS_PORT=6379
FLASK_ENV=production
SUPERSET_ENV=production
SUPERSET_LOAD_EXAMPLES=yes
CYPRESS_CONFIG=false
SUPERSET_PORT=8088


@@ -1,19 +1,19 @@
# syntax=docker/dockerfile:1
FROM python:3.10
FROM python:3
RUN useradd -d /code xf
RUN useradd -d /code pathogen
RUN mkdir /code
RUN chown xf:xf /code
RUN chown pathogen:pathogen /code
RUN mkdir /venv
RUN chown xf:xf /venv
RUN chown pathogen:pathogen /venv
USER xf
USER pathogen
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
WORKDIR /code
COPY requirements.txt /code/
COPY docker/discord-patched.tgz /code/
COPY discord-patched.tgz /code/
RUN python -m venv /venv
RUN . /venv/bin/activate && pip install -r requirements.txt


@@ -0,0 +1,348 @@
version: "2.2"
# volumes:
# metadata_data: {}
# middle_var: {}
# historical_var: {}
# broker_var: {}
# coordinator_var: {}
# router_var: {}
# druid_shared: {}
x-superset-image: &superset-image apache/superset:${TAG:-latest-dev}
x-superset-depends-on: &superset-depends-on
- db
- redis_superset
x-superset-volumes: &superset-volumes
# /app/pythonpath_docker will be appended to the PYTHONPATH in the final container
- ./docker/superset:/app/docker
- superset_home:/app/superset_home
services:
app:
image: pathogen/monolith:latest
container_name: monolith
build: ./docker
volumes:
- ${PORTAINER_GIT_DIR}:/code
env_file:
- ../stack.env
volumes_from:
- tmp
depends_on:
broker:
condition: service_started
kafka:
condition: service_healthy
tmp:
condition: service_started
redis:
condition: service_healthy
# - db
threshold:
image: pathogen/threshold:latest
container_name: threshold
build: ./legacy/docker
volumes:
- ${PORTAINER_GIT_DIR}:/code
- ${THRESHOLD_CONFIG_DIR}:/code/legacy/conf/live
#- ${THRESHOLD_TEMPLATE_DIR}:/code/conf/templates
- ${THRESHOLD_CERT_DIR}:/code/legacy/conf/cert
ports:
- "${THRESHOLD_LISTENER_PORT}:${THRESHOLD_LISTENER_PORT}"
- "${THRESHOLD_RELAY_PORT}:${THRESHOLD_RELAY_PORT}"
- "${THRESHOLD_API_PORT}:${THRESHOLD_API_PORT}"
env_file:
- ../stack.env
volumes_from:
- tmp
depends_on:
tmp:
condition: service_started
redis:
condition: service_healthy
# db:
#image: pathogen/manticore:kibana
# image: manticoresearch/manticore:latest
#build:
# context: ./docker/manticore
# args:
# DEV: 1
# restart: always
# turnilo:
# container_name: turnilo
# image: uchhatre/turnilo:latest
# ports:
# - 9093:9090
# environment:
# - DRUID_BROKER_URL=http://broker:8082
# depends_on:
# - broker
# metabase:
# container_name: metabase
# image: metabase/metabase:latest
# ports:
# - 3096:3000
# environment:
# JAVA_OPTS: -Xmx1g
# MB_DB_TYPE: postgres
# MB_DB_DBNAME: metabase
# MB_DB_PORT: 5432
# MB_DB_USER: druid
# MB_DB_PASS: FoolishPassword
# MB_DB_HOST: postgres
# depends_on:
# - broker
redis_superset:
image: redis:latest
container_name: superset_cache
restart: unless-stopped
volumes:
- redis:/data
db:
env_file: .env-non-dev
image: postgres:10
container_name: superset_db
restart: unless-stopped
volumes:
- db_home:/var/lib/postgresql/data
superset:
env_file: .env-non-dev
image: *superset-image
container_name: superset_app
command: ["/app/docker/docker-bootstrap.sh", "app-gunicorn"]
user: "root"
restart: unless-stopped
ports:
- 8088:8088
depends_on: *superset-depends-on
volumes: *superset-volumes
superset-init:
image: *superset-image
container_name: superset_init
command: ["/app/docker/docker-init.sh"]
env_file: .env-non-dev
depends_on: *superset-depends-on
user: "root"
volumes: *superset-volumes
superset-worker:
image: *superset-image
container_name: superset_worker
command: ["/app/docker/docker-bootstrap.sh", "worker"]
env_file: .env-non-dev
restart: unless-stopped
depends_on: *superset-depends-on
user: "root"
volumes: *superset-volumes
superset-worker-beat:
image: *superset-image
container_name: superset_worker_beat
command: ["/app/docker/docker-bootstrap.sh", "beat"]
env_file: .env-non-dev
restart: unless-stopped
depends_on: *superset-depends-on
user: "root"
volumes: *superset-volumes
postgres:
container_name: postgres
image: postgres:latest
volumes:
- /block/store/metadata_data:/var/lib/postgresql/data
environment:
- POSTGRES_PASSWORD=FoolishPassword
- POSTGRES_USER=druid
- POSTGRES_DB=druid
# Need 3.5 or later for container nodes
zookeeper:
container_name: zookeeper
image: zookeeper:3.5
ports:
- "2181:2181"
environment:
- ZOO_MY_ID=1
kafka:
image: wurstmeister/kafka:latest
container_name: kafka
depends_on:
- zookeeper
ports:
- 9092:9092
- 29092:29092
environment:
KAFKA_BROKER_ID: 1
KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,PLAINTEXT_HOST://localhost:29092
KAFKA_LISTENERS: PLAINTEXT://kafka:9092,PLAINTEXT_HOST://localhost:29092
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'true'
KAFKA_MESSAGE_MAX_BYTES: 2000000
#KAFKA_HEAP_OPTS: -Xmx2g
healthcheck:
test: ["CMD", "kafka-topics.sh", "--list", "--bootstrap-server", "kafka:9092"]
start_period: 15s
interval: 30s
timeout: 30s
retries: 45
coordinator:
image: apache/druid:0.23.0
container_name: coordinator
volumes:
- /block/store/druid_shared:/opt/shared
- /block/store/coordinator_var:/opt/druid/var
depends_on:
- zookeeper
- postgres
ports:
- "8081:8081"
command:
- coordinator
env_file:
- environment
broker:
image: apache/druid:0.23.0
container_name: broker
volumes:
- /block/store/broker_var:/opt/druid/var
depends_on:
- zookeeper
- postgres
- coordinator
ports:
- "8082:8082"
command:
- broker
env_file:
- environment
historical:
image: apache/druid:0.23.0
container_name: historical
volumes:
- /block/store/druid_shared:/opt/shared
- /block/store/historical_var:/opt/druid/var
depends_on:
- zookeeper
- postgres
- coordinator
ports:
- "8083:8083"
command:
- historical
env_file:
- environment
middlemanager:
image: apache/druid:0.23.0
container_name: middlemanager
volumes:
- /block/store/druid_shared:/opt/shared
- /block/store/middle_var:/opt/druid/var
depends_on:
- zookeeper
- postgres
- coordinator
ports:
- "8091:8091"
- "8100-8105:8100-8105"
command:
- middleManager
env_file:
- environment
router:
image: apache/druid:0.23.0
container_name: router
volumes:
- /block/store/router_var:/opt/druid/var
depends_on:
- zookeeper
- postgres
- coordinator
ports:
- "8888:8888"
command:
- router
env_file:
- environment
# db:
# #image: pathogen/manticore:kibana
# image: manticoresearch/manticore:dev
# #build:
# # context: ./docker/manticore
# # args:
# # DEV: 1
# restart: always
# ports:
# - 9308
# - 9312
# - 9306
# ulimits:
# nproc: 65535
# nofile:
# soft: 65535
# hard: 65535
# memlock:
# soft: -1
# hard: -1
# environment:
# - MCL=1
# volumes:
# - ./docker/data:/var/lib/manticore
# - ./docker/manticore.conf:/etc/manticoresearch/manticore.conf
tmp:
image: busybox
command: chmod -R 777 /var/run/redis
volumes:
- /var/run/redis
redis:
image: redis
command: redis-server /etc/redis.conf
ulimits:
nproc: 65535
nofile:
soft: 65535
hard: 65535
volumes:
- ${PORTAINER_GIT_DIR}/docker/redis.conf:/etc/redis.conf
- redis_data:/data
volumes_from:
- tmp
healthcheck:
test: "redis-cli -s /var/run/redis/redis.sock ping"
interval: 2s
timeout: 2s
retries: 15
networks:
default:
external:
name: pathogen
volumes:
redis_data: {}
superset_home:
external: false
db_home:
external: false
redis:
external: false

docker/environment Normal file

@@ -0,0 +1,87 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Java tuning
#DRUID_XMX=1g
#DRUID_XMS=1g
#DRUID_MAXNEWSIZE=250m
#DRUID_NEWSIZE=250m
#DRUID_MAXDIRECTMEMORYSIZE=1g
#druid_emitter_logging_logLevel=debug
#druid_extensions_loadList=["druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "postgresql-metadata-storage", "druid-kafka-indexing-service"]
#druid_zk_service_host=zookeeper
#druid_metadata_storage_host=
#druid_metadata_storage_type=postgresql
#druid_metadata_storage_connector_connectURI=jdbc:postgresql://postgres:5432/druid
#druid_metadata_storage_connector_user=druid
#druid_metadata_storage_connector_password=FoolishPassword
#druid_coordinator_balancer_strategy=cachingCost
#druid_indexer_runner_javaOptsArray=["-server", "-Xmx1g", "-Xms1g", "-XX:MaxDirectMemorySize=3g", "-Duser.timezone=UTC", "-Dfile.encoding=UTF-8", "-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
#druid_indexer_fork_property_druid_processing_buffer_sizeBytes=128MiB
#druid_processing_buffer_sizeBytes=268435456 # 256MiB
#druid_storage_type=local
#druid_storage_storageDirectory=/opt/shared/segments
#druid_indexer_logs_type=file
#druid_indexer_logs_directory=/opt/shared/indexing-logs
#druid_processing_numThreads=1
#druid_processing_numMergeBuffers=1
#DRUID_LOG4J=<?xml version="1.0" encoding="UTF-8" ?><Configuration status="WARN"><Appenders><Console name="Console" target="SYSTEM_OUT"><PatternLayout pattern="%d{ISO8601} %p [%t] %c - %m%n"/></Console></Appenders><Loggers><Root level="info"><AppenderRef ref="Console"/></Root><Logger name="org.apache.druid.jetty.RequestLog" additivity="false" level="DEBUG"><AppenderRef ref="Console"/></Logger></Loggers></Configuration>
# Java tuning
#DRUID_XMX=1g
#DRUID_XMS=1g
#DRUID_MAXNEWSIZE=250m
#DRUID_NEWSIZE=250m
#DRUID_MAXDIRECTMEMORYSIZE=6172m
DRUID_SINGLE_NODE_CONF=nano-quickstart
druid_emitter_logging_logLevel=debug
druid_extensions_loadList=["druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "postgresql-metadata-storage", "druid-kafka-indexing-service"]
druid_zk_service_host=zookeeper
druid_metadata_storage_host=
druid_metadata_storage_type=postgresql
druid_metadata_storage_connector_connectURI=jdbc:postgresql://postgres:5432/druid
druid_metadata_storage_connector_user=druid
druid_metadata_storage_connector_password=FoolishPassword
druid_coordinator_balancer_strategy=cachingCost
druid_indexer_runner_javaOptsArray=["-server", "-Xmx1g", "-Xms1g", "-XX:MaxDirectMemorySize=3g", "-Duser.timezone=UTC", "-Dfile.encoding=UTF-8", "-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
druid_indexer_fork_property_druid_processing_buffer_sizeBytes=256MiB
druid_storage_type=local
druid_storage_storageDirectory=/opt/shared/segments
druid_indexer_logs_type=file
druid_indexer_logs_directory=/opt/shared/indexing-logs
druid_processing_numThreads=2
druid_processing_numMergeBuffers=2
DRUID_LOG4J=<?xml version="1.0" encoding="UTF-8" ?><Configuration status="WARN"><Appenders><Console name="Console" target="SYSTEM_OUT"><PatternLayout pattern="%d{ISO8601} %p [%t] %c - %m%n"/></Console></Appenders><Loggers><Root level="info"><AppenderRef ref="Console"/></Root><Logger name="org.apache.druid.jetty.RequestLog" additivity="false" level="DEBUG"><AppenderRef ref="Console"/></Logger></Loggers></Configuration>

docker/manticore.conf Normal file

@@ -0,0 +1,265 @@
#!/bin/sh
ip=`hostname -i|rev|cut -d\ -f 1|rev`
cat << EOF
searchd {
# https://manual.manticoresearch.com/Server_settings/Searchd#access_plain_attrs
# access_plain_attrs = mmap_preread
# https://manual.manticoresearch.com/Server_settings/Searchd#access_blob_attrs
# access_blob_attrs = mmap_preread
# https://manual.manticoresearch.com/Server_settings/Searchd#access_doclists
# access_doclists = file
# https://manual.manticoresearch.com/Server_settings/Searchd#access_hitlists
# access_hitlists = file
# https://manual.manticoresearch.com/Server_settings/Searchd#agent_connect_timeout
# agent_connect_timeout =
# https://manual.manticoresearch.com/Server_settings/Searchd#agent_query_timeout
# agent_query_timeout =
# https://manual.manticoresearch.com/Server_settings/Searchd#agent_retry_count
# agent_retry_count = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#agent_retry_delay
# agent_retry_delay = 500
# https://manual.manticoresearch.com/Server_settings/Searchd#attr_flush_period
# attr_flush_period = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#binlog_flush
# binlog_flush = 2
# https://manual.manticoresearch.com/Server_settings/Searchd#binlog_max_log_size
# binlog_max_log_size = 268435456
# https://manual.manticoresearch.com/Server_settings/Searchd#binlog_path
# binlog_path =
# https://manual.manticoresearch.com/Server_settings/Searchd#client_timeout
# client_timeout = 300
# https://manual.manticoresearch.com/Server_settings/Searchd#collation_libc_locale
# collation_libc_locale = C
# https://manual.manticoresearch.com/Server_settings/Searchd#collation_server
# collation_server = libc_ci
# https://manual.manticoresearch.com/Server_settings/Searchd#data_dir
data_dir = /var/lib/manticore
# https://manual.manticoresearch.com/Server_settings/Searchd#docstore_cache_size
# docstore_cache_size = 16m
# https://manual.manticoresearch.com/Server_settings/Searchd#expansion_limit
# expansion_limit = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#grouping_in_utc
# grouping_in_utc = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#ha_period_karma
# ha_period_karma = 60
# https://manual.manticoresearch.com/Server_settings/Searchd#ha_ping_interval
# ha_ping_interval = 1000
# https://manual.manticoresearch.com/Server_settings/Searchd#hostname_lookup
# hostname_lookup =
# https://manual.manticoresearch.com/Server_settings/Searchd#jobs_queue_size
# jobs_queue_size =
# https://manual.manticoresearch.com/Server_settings/Searchd#listen_backlog
# listen_backlog = 5
# https://manual.manticoresearch.com/Server_settings/Searchd#listen
# listen_env = this directive allows to append listeners from environment variables
listen = 9306:mysql41
listen = /var/run/mysqld/mysqld.sock:mysql41
listen = $ip:9312
listen = 9308:http
listen = $ip:9315-9325:replication
# https://manual.manticoresearch.com/Server_settings/Searchd#listen_tfo
# listen_tfo = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#log
log = /var/log/manticore/searchd.log
# https://manual.manticoresearch.com/Server_settings/Searchd#max_batch_queries
# max_batch_queries = 32
# https://manual.manticoresearch.com/Server_settings/Searchd#threads
# threads =
# https://manual.manticoresearch.com/Server_settings/Searchd#max_filters
# max_filters = 256
# https://manual.manticoresearch.com/Server_settings/Searchd#max_filter_values
# max_filter_values = 4096
# https://manual.manticoresearch.com/Server_settings/Searchd#max_open_files
# max_open_files = max
# https://manual.manticoresearch.com/Server_settings/Searchd#max_packet_size
max_packet_size = 128M
# https://manual.manticoresearch.com/Server_settings/Searchd#mysql_version_string
# mysql_version_string =
# https://manual.manticoresearch.com/Server_settings/Searchd#net_workers
# net_workers = 1
# https://manual.manticoresearch.com/Server_settings/Searchd#net_wait_tm
# net_wait_tm = -1
# https://manual.manticoresearch.com/Server_settings/Searchd#net_throttle_accept
# net_throttle_accept = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#net_throttle_action
# net_throttle_action = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#node_address
# node_address =
# https://manual.manticoresearch.com/Server_settings/Searchd#ondisk_attrs_default
# ondisk_attrs_default = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#persistent_connections_limit
# persistent_connections_limit =
# https://manual.manticoresearch.com/Server_settings/Searchd#pid_file
pid_file = /var/run/manticore/searchd.pid
# https://manual.manticoresearch.com/Server_settings/Searchd#predicted_time_costs
# predicted_time_costs = doc=64, hit=48, skip=2048, match=64
# https://manual.manticoresearch.com/Server_settings/Searchd#preopen_indexes
# preopen_indexes = 1
# https://manual.manticoresearch.com/Server_settings/Searchd#qcache_max_bytes
qcache_max_bytes = 128Mb
# https://manual.manticoresearch.com/Server_settings/Searchd#qcache_thresh_msec
qcache_thresh_msec = 150
# https://manual.manticoresearch.com/Server_settings/Searchd#qcache_ttl_sec
qcache_ttl_sec = 120
# https://manual.manticoresearch.com/Server_settings/Searchd#query_log_format
query_log_format = sphinxql
# https://manual.manticoresearch.com/Server_settings/Searchd#query_log_min_msec
# query_log_min_msec = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#query_log
# query_log = /var/log/manticore/query.log
# https://manual.manticoresearch.com/Server_settings/Searchd#query_log_mode
# query_log_mode = 600
# https://manual.manticoresearch.com/Server_settings/Searchd#max_connections
# max_connections =
# https://manual.manticoresearch.com/Server_settings/Searchd#network_timeout
# network_timeout = 5
# https://manual.manticoresearch.com/Server_settings/Searchd#read_buffer
# read_buffer = 256K
# https://manual.manticoresearch.com/Server_settings/Searchd#read_buffer_docs
# read_buffer_docs = 256K
# https://manual.manticoresearch.com/Server_settings/Searchd#read_buffer_hits
# read_buffer_hits = 256K
# https://manual.manticoresearch.com/Server_settings/Searchd#read_unhinted
# read_unhinted 32K
# https://manual.manticoresearch.com/Server_settings/Searchd#rt_flush_period
# rt_flush_period =
# https://manual.manticoresearch.com/Server_settings/Searchd#rt_merge_iops
# rt_merge_iops = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#rt_merge_maxiosize
# rt_merge_maxiosize = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#seamless_rotate
# seamless_rotate = 1
# https://manual.manticoresearch.com/Server_settings/Searchd#server_id
# server_id =
# https://manual.manticoresearch.com/Server_settings/Searchd#shutdown_timeout
# shutdown_timeout = 3
# https://manual.manticoresearch.com/Server_settings/Searchd#shutdown_token
# shutdown_token =
# https://manual.manticoresearch.com/Server_settings/Searchd#snippets_file_prefix
# snippets_file_prefix =
# https://manual.manticoresearch.com/Server_settings/Searchd#sphinxql_state
# sphinxql_state =
# https://manual.manticoresearch.com/Server_settings/Searchd#sphinxql_timeout
# sphinxql_timeout = 900
# https://manual.manticoresearch.com/Server_settings/Searchd#ssl_ca
# ssl_ca =
# https://manual.manticoresearch.com/Server_settings/Searchd#ssl_cert
# ssl_cert =
# https://manual.manticoresearch.com/Server_settings/Searchd#ssl_key
# ssl_key =
# https://manual.manticoresearch.com/Server_settings/Searchd#subtree_docs_cache
# subtree_docs_cache = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#subtree_hits_cache
# subtree_hits_cache = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#thread_stack
# thread_stack =
# https://manual.manticoresearch.com/Server_settings/Searchd#unlink_old
# unlink_old = 1
# https://manual.manticoresearch.com/Server_settings/Searchd#watchdog
# watchdog = 1
}
common {
# https://manual.manticoresearch.com/Server_settings/Common#lemmatizer_base
# lemmatizer_base = /usr/local/share
# https://manual.manticoresearch.com/Server_settings/Common#progressive_merge
# progressive_merge =
# https://manual.manticoresearch.com/Server_settings/Common#json_autoconv_keynames
# json_autoconv_keynames =
# https://manual.manticoresearch.com/Server_settings/Common#json_autoconv_numbers
# json_autoconv_numbers = 0
# https://manual.manticoresearch.com/Server_settings/Common#on_json_attr_error
# on_json_attr_error = ignore_attr
# https://manual.manticoresearch.com/Server_settings/Common#plugin_dir
# plugin_dir =
}
# indexer {
# lemmatizer_cache = 1024M
# max_iops = 0
# max_iosize = 0
# mem_limit = 1024M
# }
EOF


@@ -0,0 +1,12 @@
_HiStOrY_V2_
SELECT * FROM films WHERE MATCH('"shark monkey boy robot"/2') AND release_year IN(2006,2007) AND rental_rate BETWEEN 2.0 and 3.0;
SELECT title, HIGHLIGHT({},'description') FROM films WHERE MATCH('"shark monkey boy robot"/2');
SELECT * FROM films WHERE MATCH('" shark monkey boy robot "/2');
SELECT * FROM films WHERE MATCH('Emotional drama') FACET release_year FACET category_id;
SELECT * FROM films WHERE MATCH('Emotional drama') GROUP BY release_year;
SELECT * FROM films WHERE MATCH('Emotional drama -dog -shark');
SELECT * FROM films WHERE MATCH('Emotional drama');
SELECT * FROM films;
DESCRIBE films;
SHOW TABLES;
SOURCE /sandbox.sql


@@ -0,0 +1,76 @@
FROM ubuntu:focal
ARG DEV
ARG DAEMON_URL
ARG MCL_URL
RUN groupadd -r manticore && useradd -r -g manticore manticore
ENV GOSU_VERSION 1.11
ENV MCL_URL=${MCL_URL:-"https://repo.manticoresearch.com/repository/manticoresearch_focal/dists/focal/main/binary-amd64/manticore-columnar-lib_1.15.4-220522-2fef34e_amd64.deb"}
ENV DAEMON_URL=${DAEMON_URL:-"https://repo.manticoresearch.com/repository/manticoresearch_focal/dists/manticore_5.0.2-220530-348514c86_amd64.tgz"}
ENV BETA_URL=${BETA_URL:-"https://repo.manticoresearch.com/repository/kibana_beta/ubuntu/focal.zip"}
RUN set -x \
&& apt-get update && apt-get -y install --no-install-recommends ca-certificates binutils wget gnupg dirmngr unzip && rm -rf /var/lib/apt/lists/* \
&& wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$(dpkg --print-architecture)" \
&& wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$(dpkg --print-architecture).asc" \
&& export GNUPGHOME="$(mktemp -d)" \
&& gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4 \
&& gpg --batch --verify /usr/local/bin/gosu.asc /usr/local/bin/gosu \
&& { command -v gpgconf > /dev/null && gpgconf --kill all || :; } \
&& rm -rf "$GNUPGHOME" /usr/local/bin/gosu.asc \
&& chmod +x /usr/local/bin/gosu \
&& gosu nobody true && \
if [ "${DEV}" = "1" ]; then \
echo "DEV IS ONE" && \
exit && \
wget https://repo.manticoresearch.com/manticore-dev-repo.noarch.deb \
&& dpkg -i manticore-dev-repo.noarch.deb \
&& apt-key adv --fetch-keys 'https://repo.manticoresearch.com/GPG-KEY-manticore' && apt-get -y update && apt-get -y install manticore \
&& apt-get update \
&& echo $(apt-get -y download --print-uris manticore-columnar-lib | cut -d" " -f1 | cut -d "'" -f 2) > /mcl.url ;\
elif [ "${DEV}" = "2" ]; then \
echo "DEV IS TWO" && \
wget $BETA_URL && unzip focal.zip && rm focal.zip && \
dpkg -i build/* && echo $MCL_URL > /mcl.url; rm build/* ;\
else \
echo "DEV NOT EITHER" && \
exit && \
wget $DAEMON_URL && ARCHIVE_NAME=$(ls | grep '.tgz' | head -n1 ) && tar -xf $ARCHIVE_NAME && rm $ARCHIVE_NAME && \
dpkg -i manticore* && echo $MCL_URL > /mcl.url && rm *.deb ; \
fi \
&& mkdir -p /var/run/manticore && mkdir -p /var/lib/manticore/replication \
&& apt-get update && apt-get -y install libexpat1 libodbc1 libpq5 openssl libcrypto++6 libmysqlclient21 mysql-client \
&& apt-get -y purge --auto-remove \
&& rm -rf /var/lib/apt/lists/* \
&& rm -f /usr/bin/mariabackup /usr/bin/mysqldump /usr/bin/mysqlslap /usr/bin/mysqladmin /usr/bin/mysqlimport \
/usr/bin/mysqlshow /usr/bin/mbstream /usr/bin/mysql_waitpid /usr/bin/innotop /usr/bin/mysqlaccess /usr/bin/mytop \
/usr/bin/mysqlreport /usr/bin/mysqldumpslow /usr/bin/mysql_find_rows /usr/bin/mysql_fix_extensions \
/usr/bin/mysql_embedded /usr/bin/mysqlcheck \
&& rm -f /usr/bin/spelldump /usr/bin/wordbreaker \
&& mkdir -p /var/run/mysqld/ && chown manticore:manticore /var/run/mysqld/ \
&& echo "\n[mysql]\nsilent\nwait\ntable\n" >> /etc/mysql/my.cnf && \
wget -P /tmp https://repo.manticoresearch.com/repository/morphology/en.pak.tgz && \
wget -P /tmp https://repo.manticoresearch.com/repository/morphology/de.pak.tgz && \
wget -P /tmp https://repo.manticoresearch.com/repository/morphology/ru.pak.tgz && \
tar -xf /tmp/en.pak.tgz -C /usr/share/manticore/ && \
tar -xf /tmp/de.pak.tgz -C /usr/share/manticore/ && \
tar -xf /tmp/ru.pak.tgz -C /usr/share/manticore/
COPY manticore.conf /etc/manticoresearch/
COPY sandbox.sql /sandbox.sql
COPY .mysql_history /root/.mysql_history
COPY docker-entrypoint.sh /usr/local/bin/
RUN ln -s usr/local/bin/docker-entrypoint.sh /entrypoint.sh
WORKDIR /var/lib/manticore
ENTRYPOINT ["docker-entrypoint.sh"]
EXPOSE 9306
EXPOSE 9308
EXPOSE 9312
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
CMD ["sh", "-c", "(echo 'START WAIT' && sleep 5 && echo 'END WAIT' && mysql -P9306 -h0 -e 'set global log_management = 0; set global log_management = 1;') & searchd --nodetach"]

docker/manticore/README.md Normal file

@@ -0,0 +1,278 @@
# Manticore Search Docker image
This is the git repo of official [Docker image](https://hub.docker.com/r/manticoresearch/manticore/) for [Manticore Search](https://github.com/manticoresoftware/manticoresearch).
Manticore Search is an easy-to-use, fast, open-source database for search. It helps thousands of companies, from small to large (such as Craigslist), search and filter petabytes of text data on a single node or hundreds of nodes, do streaming full-text filtering, and add auto-complete, spell correction, more-like-this, faceting and other search-related features to their websites and applications.
The default configuration includes a sample Real-Time index and listens on the default ports:
* `9306` for connections from a MySQL client
* `9308` for connections via HTTP
* `9312` for connections via a binary protocol (e.g. in case you run a cluster)
The image comes with libraries for easily indexing data from MySQL, PostgreSQL, XML and CSV files.
# How to run Manticore Search Docker image
## Quick usage
The simplest way to start Manticore in a container and log into it via the mysql client is:
```bash
docker run --name manticore --rm -d manticoresearch/manticore && sleep 3 && docker exec -it manticore mysql && docker stop manticore
```
When you exit the mysql client, the container is stopped and removed, so **use it only for testing / sandboxing purposes**. See below for how to use it in production.
The image comes with a sample index which can be loaded like this:
```mysql
mysql> source /sandbox.sql
```
The mysql client's history also contains several sample queries that you can run against the above index; use the Up/Down keys in the client to browse and run them.
## Production use
### Ports and mounting points
For data persistence `/var/lib/manticore/` should be mounted to local storage or other desired storage engine.
```bash
docker run --name manticore -v $(pwd)/data:/var/lib/manticore -p 127.0.0.1:9306:9306 -p 127.0.0.1:9308:9308 -d manticoresearch/manticore
```
The configuration file inside the container is located at `/etc/manticoresearch/manticore.conf`. To use custom settings, mount your own configuration file over it.
The ports are 9306/9308/9312 for SQL/HTTP/binary; expose them depending on how you are going to use Manticore. For example:
```bash
docker run --name manticore -v $(pwd)/manticore.conf:/etc/manticoresearch/manticore.conf -v $(pwd)/data:/var/lib/manticore/ -p 127.0.0.1:9306:9306 -p 127.0.0.1:9308:9308 -d manticoresearch/manticore
```
Make sure to remove `127.0.0.1:` if you want the ports to be available for external hosts.
### Manticore Columnar Library
The Docker image doesn't include the [Manticore Columnar Library](https://github.com/manticoresoftware/columnar), which is required if you need:
* columnar storage
* secondary indexes
You can, however, easily enable it at runtime by using the environment variable `MCL=1`, i.e. `docker run -e MCL=1 ... manticoresearch/manticore`. The container will then download and install the library and put it in the data dir (which is normally mapped as a volume in production). Next time you run the container the library will already be there, so it won't be downloaded again unless you change the Manticore Search version.
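For instance, here is a minimal sketch (the container name and host path are illustrative) that combines `MCL=1` with a mounted data directory, so the downloaded library persists between runs:
```bash
# MCL=1 makes the container download the Columnar Library into the data dir on first start
docker run --name manticore -e MCL=1 \
  -v $(pwd)/data:/var/lib/manticore \
  -p 127.0.0.1:9306:9306 -p 127.0.0.1:9308:9308 \
  -d manticoresearch/manticore
```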
### Docker-compose
In many cases you might want to use Manticore together with other images specified in a docker-compose YAML file. Here is the minimal recommended specification for Manticore Search in docker-compose.yml:
```yaml
version: '2.2'
services:
manticore:
container_name: manticore
image: manticoresearch/manticore
restart: always
ports:
- 127.0.0.1:9306:9306
- 127.0.0.1:9308:9308
ulimits:
nproc: 65535
nofile:
soft: 65535
hard: 65535
memlock:
soft: -1
hard: -1
environment:
- MCL=1
volumes:
- ./data:/var/lib/manticore
# - ./manticore.conf:/etc/manticoresearch/manticore.conf # uncomment if you use a custom config
```
Besides using the exposed ports 9306 and 9308 you can log into the instance by running `docker-compose exec manticore mysql`.
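For example, once the services are defined as above:
```bash
# Bring the stack up in the background, then open a SQL session inside the container
docker-compose up -d
docker-compose exec manticore mysql
```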
### HTTP protocol
Manticore is accessible via HTTP on ports 9308 and 9312. You can map either of them locally and connect with curl:
```bash
docker run --name manticore -p 9308:9308 -d manticoresearch/manticore
```
Create a table:
```bash
curl -X POST 'http://127.0.0.1:9308/sql' -d 'mode=raw&query=CREATE TABLE testrt ( title text, content text, gid integer)'
```
Insert a document:
```bash
curl -X POST 'http://127.0.0.1:9308/json/insert' -d'{"index":"testrt","id":1,"doc":{"title":"Hello","content":"world","gid":1}}'
```
Perform a simple search:
```bash
curl -X POST 'http://127.0.0.1:9308/json/search' -d '{"index":"testrt","query":{"match":{"*":"hello world"}}}'
```
### Logging
By default, Manticore logs to `/dev/stdout`, so you can watch the log on the host with:
```bash
docker logs manticore
```
If you want your query log streamed the same way, pass the environment variable `QUERY_LOG_TO_STDOUT=true`.
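For example (a minimal sketch, assuming the container name `manticore` as above):
```bash
# The entrypoint symlinks the query log to /dev/stdout, so queries appear in docker logs
docker run --name manticore -e QUERY_LOG_TO_STDOUT=true -d manticoresearch/manticore
docker logs -f manticore
```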
### Multi-node cluster with replication
Here is a simple `docker-compose.yml` for defining a two node cluster:
```yaml
version: '2.2'
services:
manticore-1:
image: manticoresearch/manticore
restart: always
ulimits:
nproc: 65535
nofile:
soft: 65535
hard: 65535
memlock:
soft: -1
hard: -1
environment:
- MCL=1
networks:
- manticore
manticore-2:
image: manticoresearch/manticore
restart: always
ulimits:
nproc: 65535
nofile:
soft: 65535
hard: 65535
memlock:
soft: -1
hard: -1
environment:
- MCL=1
networks:
- manticore
networks:
manticore:
driver: bridge
```
* Start it: `docker-compose up`
* Create a cluster with a table:
```mysql
$ docker-compose exec manticore-1 mysql
mysql> CREATE TABLE testrt ( title text, content text, gid integer);
mysql> CREATE CLUSTER posts;
Query OK, 0 rows affected (0.24 sec)
mysql> ALTER CLUSTER posts ADD testrt;
Query OK, 0 rows affected (0.07 sec)
MySQL [(none)]> exit
Bye
```
* Join the cluster from the 2nd instance and insert something into the table:
```mysql
$ docker-compose exec manticore-2 mysql
mysql> JOIN CLUSTER posts AT 'manticore-1:9312';
mysql> INSERT INTO posts:testrt(title,content,gid) VALUES('hello','world',1);
Query OK, 1 row affected (0.00 sec)
MySQL [(none)]> exit
Bye
```
* If you now go back to the first instance you'll see the new record:
```mysql
$ docker-compose exec manticore-1 mysql
MySQL [(none)]> select * from testrt;
+---------------------+------+-------+---------+
| id | gid | title | content |
+---------------------+------+-------+---------+
| 3891565839006040065 | 1 | hello | world |
+---------------------+------+-------+---------+
1 row in set (0.00 sec)
MySQL [(none)]> exit
Bye
```
## Memory locking and limits
It's recommended to override Docker's default ulimits for the Manticore container:
```bash
--ulimit nofile=65536:65536
```
For the best performance, Manticore tables' components can be locked into memory. When Manticore is run under Docker, the instance requires additional privileges to allow memory locking. The following options must be added when running the instance:
```bash
--cap-add=IPC_LOCK --ulimit memlock=-1:-1
```
## Configuring Manticore Search with Docker
If you want to run Manticore with a custom config containing index definitions, you will need to mount the configuration into the container:
```bash
docker run --name manticore -v $(pwd)/manticore.conf:/etc/manticoresearch/manticore.conf -v $(pwd)/data/:/var/lib/manticore -p 127.0.0.1:9306:9306 -d manticoresearch/manticore
```
Take into account that Manticore Search inside the container runs under the user `manticore`. Operations with tables (like creating or rotating plain indexes) should also be performed as `manticore`; otherwise the files will be created as `root` and the search daemon won't have the rights to open them. For example, here is how you can rotate all plain indexes:
```bash
docker exec -it manticore gosu manticore indexer --all --rotate
```
### Environment variables
You can also set individual `searchd` and `common` configuration settings using Docker environment variables.
The settings must be prefixed with their section name; for example, to change the value of the setting `mysql_version_string` in the `searchd` section, the variable must be named `searchd_mysql_version_string`:
```bash
docker run --name manticore -p 127.0.0.1:9306:9306 -e searchd_mysql_version_string='5.5.0' -d manticoresearch/manticore
```
For the `listen` directive, you can use the Docker variable `searchd_listen` to add new listening interfaces in addition to the default ones. Multiple interfaces can be declared, separated by a pipe character ("|").
To listen only on a specific network address, `$ip` (retrieved internally from `hostname -i`) can be used as an address alias.
For example, `-e searchd_listen='9316:http|9307:mysql|$ip:5443:mysql_vip'` will add an additional SQL interface on port 9307, a SQL VIP on port 5443 listening only on the instance IP and HTTP on port 9316, besides the defaults on 9306 and 9308.
```bash
$ docker run --rm -p 1188:9307 -e searchd_mysql_version_string='5.5.0' -e searchd_listen='9316:http|9307:mysql|$ip:5443:mysql_vip' manticore
[Mon Aug 17 07:31:58.719 2020] [1] using config file '/etc/manticoresearch/manticore.conf' (9130 chars)...
listening on all interfaces for http, port=9316
listening on all interfaces for mysql, port=9307
listening on 172.17.0.17:5443 for VIP mysql
listening on all interfaces for mysql, port=9306
listening on UNIX socket /var/run/mysqld/mysqld.sock
listening on 172.17.0.17:9312 for sphinx
listening on all interfaces for http, port=9308
prereading 0 indexes
prereaded 0 indexes in 0.000 sec
accepting connections
```
# Issues
For reporting issues, please use the [issue tracker](https://github.com/manticoresoftware/docker/issues).


@@ -0,0 +1,118 @@
#!/bin/bash
set -eo pipefail
echo "RUNNING ENTRYPOINT"
# check to see if this file is being run or sourced from another script
_is_sourced() {
# https://unix.stackexchange.com/a/215279
[ "${#FUNCNAME[@]}" -ge 2 ] &&
[ "${FUNCNAME[0]}" = '_is_sourced' ] &&
[ "${FUNCNAME[1]}" = 'source' ]
}
_searchd_want_help() {
local arg
for arg; do
case "$arg" in
-'?' | --help | -h | -v)
return 0
;;
esac
done
return 1
}
docker_setup_env() {
if [ -n "$QUERY_LOG_TO_STDOUT" ]; then
export searchd_query_log=/var/log/manticore/query.log
[ ! -f /var/log/manticore/query.log ] && ln -sf /dev/stdout /var/log/manticore/query.log
fi
if [[ "${MCL}" == "1" ]]; then
LIB_MANTICORE_COLUMNAR="/var/lib/manticore/.mcl/lib_manticore_columnar.so"
LIB_MANTICORE_SECONDARY="/var/lib/manticore/.mcl/lib_manticore_secondary.so"
[ -L /usr/share/manticore/modules/lib_manticore_columnar.so ] || ln -s $LIB_MANTICORE_COLUMNAR /usr/share/manticore/modules/lib_manticore_columnar.so
[ -L /usr/share/manticore/modules/lib_manticore_secondary.so ] || ln -s $LIB_MANTICORE_SECONDARY /usr/share/manticore/modules/lib_manticore_secondary.so
searchd -v|grep -i error|egrep "trying to load" \
&& rm $LIB_MANTICORE_COLUMNAR $LIB_MANTICORE_SECONDARY \
&& echo "WARNING: wrong MCL version was removed, installing the correct one"
if [[ ! -f "$LIB_MANTICORE_COLUMNAR" || ! -f "$LIB_MANTICORE_SECONDARY" ]]; then
if ! mkdir -p /var/lib/manticore/.mcl/ ; then
echo "ERROR: Manticore Columnar Library is inaccessible: couldn't create /var/lib/manticore/.mcl/."
exit
fi
MCL_URL=$(cat /mcl.url)
wget -P /tmp $MCL_URL
LAST_PATH=$(pwd)
cd /tmp
PACKAGE_NAME=$(ls | grep manticore-columnar | head -n 1)
ar -x $PACKAGE_NAME
tar -xf data.tar.gz
find . -name '*.so' -exec cp {} /var/lib/manticore/.mcl/ \;
cd $LAST_PATH
fi
fi
}
_main() {
# first arg is `h` or some `--option`
if [ "${1#-}" != "$1" ]; then
set -- searchd "$@"
fi
# Amended from searchd to sh since we're using sh to wait until searchd starts, then set the Kibana-specific options
if [ "$1" = 'sh' ] && ! _searchd_want_help "@"; then
docker_setup_env "$@"
# allow the container to be started with `--user`
if [ "$(id -u)" = '0' ]; then
find /var/lib/manticore /var/log/manticore /var/run/manticore /etc/manticoresearch \! -user manticore -exec chown manticore '{}' +
exec gosu manticore "$0" "$@"
fi
fi
_replace_conf_from_env
exec "$@"
}
_replace_conf_from_env() {
sed_query=""
while IFS='=' read -r oldname value; do
if [[ $oldname == 'searchd_'* || $oldname == 'common_'* ]]; then
value=$(echo ${!oldname} | sed 's/\//\\\//g')
oldname=$(echo $oldname | sed "s/searchd_//g;s/common_//g;")
newname=$oldname
if [[ $newname == 'listen' ]]; then
oldname="listen_env"
IFS='|' read -ra ADDR <<<"$value"
count=0
for i in "${ADDR[@]}"; do
if [[ $count == 0 ]]; then
value=$i
else
value="$value\n listen = $i"
fi
count=$((count + 1))
done
fi
if [[ -z $sed_query ]]; then
sed_query="s/(#\s)*?$oldname\s?=\s?.*?$/$newname = $value/g"
else
sed_query="$sed_query;s/(#\s)*?$oldname\s?=\s?.*?$/$newname = $value/g"
fi
fi
done < <(env)
if [[ ! -z $sed_query ]]; then
sed -i -E "$sed_query" /etc/manticoresearch/manticore.conf
fi
}
# If we are sourced from elsewhere, don't perform any further actions
if ! _is_sourced; then
_main "$@"
fi


@@ -0,0 +1,259 @@
#!/bin/sh
ip=`hostname -i|rev|cut -d\ -f 1|rev`
cat << EOF
searchd {
# https://manual.manticoresearch.com/Server_settings/Searchd#access_plain_attrs
# access_plain_attrs = mmap_preread
# https://manual.manticoresearch.com/Server_settings/Searchd#access_blob_attrs
# access_blob_attrs = mmap_preread
# https://manual.manticoresearch.com/Server_settings/Searchd#access_doclists
# access_doclists = file
# https://manual.manticoresearch.com/Server_settings/Searchd#access_hitlists
# access_hitlists = file
# https://manual.manticoresearch.com/Server_settings/Searchd#agent_connect_timeout
# agent_connect_timeout =
# https://manual.manticoresearch.com/Server_settings/Searchd#agent_query_timeout
# agent_query_timeout =
# https://manual.manticoresearch.com/Server_settings/Searchd#agent_retry_count
# agent_retry_count = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#agent_retry_delay
# agent_retry_delay = 500
# https://manual.manticoresearch.com/Server_settings/Searchd#attr_flush_period
# attr_flush_period = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#binlog_flush
# binlog_flush = 2
# https://manual.manticoresearch.com/Server_settings/Searchd#binlog_max_log_size
# binlog_max_log_size = 268435456
# https://manual.manticoresearch.com/Server_settings/Searchd#binlog_path
# binlog_path =
# https://manual.manticoresearch.com/Server_settings/Searchd#client_timeout
# client_timeout = 300
# https://manual.manticoresearch.com/Server_settings/Searchd#collation_libc_locale
# collation_libc_locale = C
# https://manual.manticoresearch.com/Server_settings/Searchd#collation_server
# collation_server = libc_ci
# https://manual.manticoresearch.com/Server_settings/Searchd#data_dir
data_dir = /var/lib/manticore
# https://manual.manticoresearch.com/Server_settings/Searchd#docstore_cache_size
# docstore_cache_size = 16m
# https://manual.manticoresearch.com/Server_settings/Searchd#expansion_limit
# expansion_limit = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#grouping_in_utc
# grouping_in_utc = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#ha_period_karma
# ha_period_karma = 60
# https://manual.manticoresearch.com/Server_settings/Searchd#ha_ping_interval
# ha_ping_interval = 1000
# https://manual.manticoresearch.com/Server_settings/Searchd#hostname_lookup
# hostname_lookup =
# https://manual.manticoresearch.com/Server_settings/Searchd#jobs_queue_size
# jobs_queue_size =
# https://manual.manticoresearch.com/Server_settings/Searchd#listen_backlog
# listen_backlog = 5
# https://manual.manticoresearch.com/Server_settings/Searchd#listen
# listen_env = this directive allows to append listeners from environment variables
listen = 9306:mysql41
listen = /var/run/mysqld/mysqld.sock:mysql41
listen = $ip:9312
listen = 9308:http
listen = $ip:9315-9325:replication
# https://manual.manticoresearch.com/Server_settings/Searchd#listen_tfo
# listen_tfo = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#log
log = /var/log/manticore/searchd.log
# https://manual.manticoresearch.com/Server_settings/Searchd#max_batch_queries
# max_batch_queries = 32
# https://manual.manticoresearch.com/Server_settings/Searchd#threads
# threads =
# https://manual.manticoresearch.com/Server_settings/Searchd#max_filters
# max_filters = 256
# https://manual.manticoresearch.com/Server_settings/Searchd#max_filter_values
# max_filter_values = 4096
# https://manual.manticoresearch.com/Server_settings/Searchd#max_open_files
# max_open_files =
# https://manual.manticoresearch.com/Server_settings/Searchd#max_packet_size
max_packet_size = 128M
# https://manual.manticoresearch.com/Server_settings/Searchd#mysql_version_string
# mysql_version_string =
# https://manual.manticoresearch.com/Server_settings/Searchd#net_workers
# net_workers = 1
# https://manual.manticoresearch.com/Server_settings/Searchd#net_wait_tm
# net_wait_tm = -1
# https://manual.manticoresearch.com/Server_settings/Searchd#net_throttle_accept
# net_throttle_accept = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#net_throttle_action
# net_throttle_action = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#node_address
# node_address =
# https://manual.manticoresearch.com/Server_settings/Searchd#ondisk_attrs_default
# ondisk_attrs_default = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#persistent_connections_limit
# persistent_connections_limit =
# https://manual.manticoresearch.com/Server_settings/Searchd#pid_file
pid_file = /var/run/manticore/searchd.pid
# https://manual.manticoresearch.com/Server_settings/Searchd#predicted_time_costs
# predicted_time_costs = doc=64, hit=48, skip=2048, match=64
# https://manual.manticoresearch.com/Server_settings/Searchd#preopen_indexes
# preopen_indexes = 1
# https://manual.manticoresearch.com/Server_settings/Searchd#qcache_max_bytes
# qcache_max_bytes = 16Mb
# https://manual.manticoresearch.com/Server_settings/Searchd#qcache_thresh_msec
# qcache_thresh_msec = 3000
# https://manual.manticoresearch.com/Server_settings/Searchd#qcache_ttl_sec
# qcache_ttl_sec = 60
# https://manual.manticoresearch.com/Server_settings/Searchd#query_log_format
query_log_format = sphinxql
# https://manual.manticoresearch.com/Server_settings/Searchd#query_log_min_msec
query_log_min_msec = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#query_log
query_log = /var/log/manticore/query.log
# https://manual.manticoresearch.com/Server_settings/Searchd#query_log_mode
# query_log_mode = 600
# https://manual.manticoresearch.com/Server_settings/Searchd#max_connections
# max_connections =
# https://manual.manticoresearch.com/Server_settings/Searchd#network_timeout
# network_timeout = 5
# https://manual.manticoresearch.com/Server_settings/Searchd#read_buffer
# read_buffer = 256K
# https://manual.manticoresearch.com/Server_settings/Searchd#read_buffer_docs
# read_buffer_docs = 256K
# https://manual.manticoresearch.com/Server_settings/Searchd#read_buffer_hits
# read_buffer_hits = 256K
# https://manual.manticoresearch.com/Server_settings/Searchd#read_unhinted
# read_unhinted 32K
# https://manual.manticoresearch.com/Server_settings/Searchd#rt_flush_period
# rt_flush_period =
# https://manual.manticoresearch.com/Server_settings/Searchd#rt_merge_iops
# rt_merge_iops = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#rt_merge_maxiosize
# rt_merge_maxiosize = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#seamless_rotate
# seamless_rotate = 1
# https://manual.manticoresearch.com/Server_settings/Searchd#server_id
# server_id =
# https://manual.manticoresearch.com/Server_settings/Searchd#shutdown_timeout
# shutdown_timeout = 3
# https://manual.manticoresearch.com/Server_settings/Searchd#shutdown_token
# shutdown_token =
# https://manual.manticoresearch.com/Server_settings/Searchd#snippets_file_prefix
# snippets_file_prefix =
# https://manual.manticoresearch.com/Server_settings/Searchd#sphinxql_state
# sphinxql_state =
# https://manual.manticoresearch.com/Server_settings/Searchd#sphinxql_timeout
# sphinxql_timeout = 900
# https://manual.manticoresearch.com/Server_settings/Searchd#ssl_ca
# ssl_ca =
# https://manual.manticoresearch.com/Server_settings/Searchd#ssl_cert
# ssl_cert =
# https://manual.manticoresearch.com/Server_settings/Searchd#ssl_key
# ssl_key =
# https://manual.manticoresearch.com/Server_settings/Searchd#subtree_docs_cache
# subtree_docs_cache = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#subtree_hits_cache
# subtree_hits_cache = 0
# https://manual.manticoresearch.com/Server_settings/Searchd#thread_stack
# thread_stack =
# https://manual.manticoresearch.com/Server_settings/Searchd#unlink_old
# unlink_old = 1
# https://manual.manticoresearch.com/Server_settings/Searchd#watchdog
# watchdog = 1
}
common {
# https://manual.manticoresearch.com/Server_settings/Common#lemmatizer_base
# lemmatizer_base = /usr/local/share
# https://manual.manticoresearch.com/Server_settings/Common#progressive_merge
# progressive_merge =
# https://manual.manticoresearch.com/Server_settings/Common#json_autoconv_keynames
# json_autoconv_keynames =
# https://manual.manticoresearch.com/Server_settings/Common#json_autoconv_numbers
# json_autoconv_numbers = 0
# https://manual.manticoresearch.com/Server_settings/Common#on_json_attr_error
# on_json_attr_error = ignore_attr
# https://manual.manticoresearch.com/Server_settings/Common#plugin_dir
# plugin_dir =
}
EOF

File diff suppressed because one or more lines are too long


@@ -1,5 +1,2 @@
unixsocket /var/run/monolith-redis.sock
unixsocket /var/run/redis/redis.sock
unixsocketperm 777
port 0
# port 6379
# requirepass changeme

docker/requirements.txt Normal file

@@ -0,0 +1,23 @@
wheel
beautifulsoup4
redis
siphashc
aiohttp[speedups]
python-dotenv
#manticoresearch
numpy
aioredis[hiredis]
aiokafka
vaderSentiment
polyglot
pyicu
pycld2
morfessor
six
nltk
#spacy
gensim
python-Levenshtein
orjson
uvloop
numba


@@ -0,0 +1,50 @@
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
set -eo pipefail
REQUIREMENTS_LOCAL="/app/docker/requirements-local.txt"
# If Cypress run overwrite the password for admin and export env variables
if [ "$CYPRESS_CONFIG" == "true" ]; then
export SUPERSET_CONFIG=tests.integration_tests.superset_test_config
export SUPERSET_TESTENV=true
export SUPERSET__SQLALCHEMY_DATABASE_URI=postgresql+psycopg2://superset:superset@db:5432/superset
fi
#
# Make sure we have dev requirements installed
#
if [ -f "${REQUIREMENTS_LOCAL}" ]; then
echo "Installing local overrides at ${REQUIREMENTS_LOCAL}"
pip install -r "${REQUIREMENTS_LOCAL}"
else
echo "Skipping local overrides"
fi
if [[ "${1}" == "worker" ]]; then
echo "Starting Celery worker..."
celery --app=superset.tasks.celery_app:app worker -Ofair -l INFO
elif [[ "${1}" == "beat" ]]; then
echo "Starting Celery beat..."
celery --app=superset.tasks.celery_app:app beat --pidfile /tmp/celerybeat.pid -l INFO -s "${SUPERSET_HOME}"/celerybeat-schedule
elif [[ "${1}" == "app" ]]; then
echo "Starting web app..."
flask run -p 8088 --with-threads --reload --debugger --host=0.0.0.0
elif [[ "${1}" == "app-gunicorn" ]]; then
echo "Starting web app..."
/usr/bin/run-server.sh
fi

docker/superset/docker-init.sh Executable file

@@ -0,0 +1,78 @@
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
set -e
#
# Always install local overrides first
#
/app/docker/docker-bootstrap.sh
STEP_CNT=4
echo_step() {
cat <<EOF
######################################################################
Init Step ${1}/${STEP_CNT} [${2}] -- ${3}
######################################################################
EOF
}
ADMIN_PASSWORD="admin"
# If Cypress run overwrite the password for admin and export env variables
if [ "$CYPRESS_CONFIG" == "true" ]; then
ADMIN_PASSWORD="general"
export SUPERSET_CONFIG=tests.integration_tests.superset_test_config
export SUPERSET_TESTENV=true
export SUPERSET__SQLALCHEMY_DATABASE_URI=postgresql+psycopg2://superset:superset@db:5432/superset
fi
# Initialize the database
echo_step "1" "Starting" "Applying DB migrations"
superset db upgrade
echo_step "1" "Complete" "Applying DB migrations"
# Create an admin user
echo_step "2" "Starting" "Setting up admin user ( admin / $ADMIN_PASSWORD )"
superset fab create-admin \
--username admin \
--firstname Superset \
--lastname Admin \
--email admin@superset.com \
--password $ADMIN_PASSWORD
echo_step "2" "Complete" "Setting up admin user"
# Create default roles and permissions
echo_step "3" "Starting" "Setting up roles and perms"
superset init
echo_step "3" "Complete" "Setting up roles and perms"
if [ "$SUPERSET_LOAD_EXAMPLES" = "yes" ]; then
# Load some data to play with
echo_step "4" "Starting" "Loading examples"
# If Cypress run which consumes superset_test_config load required data for tests
if [ "$CYPRESS_CONFIG" == "true" ]; then
superset load_test_users
superset load_examples --load-test-data
else
superset load_examples
fi
echo_step "4" "Complete" "Loading examples"
fi


@@ -0,0 +1 @@
pydruid


@@ -1,6 +1,6 @@
PORTAINER_GIT_DIR=..
PORTAINER_GIT_DIR=.
MODULES_ENABLED="dis"
DISCORD_TOKEN=
DISCORD_TOKEN="xx"
THRESHOLD_LISTENER_HOST=0.0.0.0
THRESHOLD_LISTENER_PORT=13867
THRESHOLD_LISTENER_SSL=1
@@ -13,16 +13,16 @@ THRESHOLD_RELAY_SSL=1
THRESHOLD_API_ENABLED=1
THRESHOLD_API_HOST=0.0.0.0
THRESHOLD_API_PORT=13869
PORTAINER_GIT_DIR=.
THRESHOLD_CONFIG_DIR=../legacy/conf/live/
#THRESHOLD_TEMPLATE_DIR=../legacy/conf/templates/
THRESHOLD_CERT_DIR=../legacy/conf/cert/
THRESHOLD_CONFIG_DIR=./legacy/conf/live/
THRESHOLD_CERT_DIR=./legacy/conf/cert/
# How many messages to ingest at once from Redis
MONOLITH_INGEST_CHUNK_SIZE=70000
MONOLITH_INGEST_CHUNK_SIZE=900
# Time to wait between polling Redis again
MONOLITH_INGEST_ITER_DELAY=2
MONOLITH_INGEST_ITER_DELAY=0.5
# Number of 4chan threads to request at once
MONOLITH_CH4_THREADS_CONCURRENT=1000
@@ -31,20 +31,11 @@ MONOLITH_CH4_THREADS_CONCURRENT=1000
MONOLITH_CH4_THREADS_DELAY=0.1
# Time to wait after finishing a crawl before starting again
MONOLITH_CH4_CRAWL_DELAY=60
MONOLITH_CH4_CRAWL_DELAY=30
# Semaphore value
MONOLITH_CH4_THREADS_SEMAPHORE=1000
# Threads to use for data processing
# Leave uncommented to use all available threads
MONOLITH_PROCESS_THREADS=7
# Enable performance metrics after message processing
MONOLITH_PROCESS_PERFSTATS=0
# Elasticsearch
ELASTICSEARCH_USERNAME=elastic
ELASTICSEARCH_PASSWORD=
ELASTICSEARCH_HOST=https://es01:9200
ELASTICSEARCH_TLS=1
# MONOLITH_PROCESS_THREADS=4

environment Normal file

@@ -0,0 +1,87 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Java tuning
#DRUID_XMX=1g
#DRUID_XMS=1g
#DRUID_MAXNEWSIZE=250m
#DRUID_NEWSIZE=250m
#DRUID_MAXDIRECTMEMORYSIZE=1g
#druid_emitter_logging_logLevel=debug
#druid_extensions_loadList=["druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "postgresql-metadata-storage", "druid-kafka-indexing-service"]
#druid_zk_service_host=zookeeper
#druid_metadata_storage_host=
#druid_metadata_storage_type=postgresql
#druid_metadata_storage_connector_connectURI=jdbc:postgresql://postgres:5432/druid
#druid_metadata_storage_connector_user=druid
#druid_metadata_storage_connector_password=FoolishPassword
#druid_coordinator_balancer_strategy=cachingCost
#druid_indexer_runner_javaOptsArray=["-server", "-Xmx1g", "-Xms1g", "-XX:MaxDirectMemorySize=3g", "-Duser.timezone=UTC", "-Dfile.encoding=UTF-8", "-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
#druid_indexer_fork_property_druid_processing_buffer_sizeBytes=128MiB
#druid_processing_buffer_sizeBytes=268435456 # 256MiB
#druid_storage_type=local
#druid_storage_storageDirectory=/opt/shared/segments
#druid_indexer_logs_type=file
#druid_indexer_logs_directory=/opt/shared/indexing-logs
#druid_processing_numThreads=1
#druid_processing_numMergeBuffers=1
#DRUID_LOG4J=<?xml version="1.0" encoding="UTF-8" ?><Configuration status="WARN"><Appenders><Console name="Console" target="SYSTEM_OUT"><PatternLayout pattern="%d{ISO8601} %p [%t] %c - %m%n"/></Console></Appenders><Loggers><Root level="info"><AppenderRef ref="Console"/></Root><Logger name="org.apache.druid.jetty.RequestLog" additivity="false" level="DEBUG"><AppenderRef ref="Console"/></Logger></Loggers></Configuration>
# Java tuning
#DRUID_XMX=1g
#DRUID_XMS=1g
#DRUID_MAXNEWSIZE=250m
#DRUID_NEWSIZE=250m
#DRUID_MAXDIRECTMEMORYSIZE=6172m
DRUID_SINGLE_NODE_CONF=nano-quickstart
druid_emitter_logging_logLevel=debug
druid_extensions_loadList=["druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "postgresql-metadata-storage", "druid-kafka-indexing-service"]
druid_zk_service_host=zookeeper
druid_metadata_storage_host=
druid_metadata_storage_type=postgresql
druid_metadata_storage_connector_connectURI=jdbc:postgresql://postgres:5432/druid
druid_metadata_storage_connector_user=druid
druid_metadata_storage_connector_password=FoolishPassword
druid_coordinator_balancer_strategy=cachingCost
druid_indexer_runner_javaOptsArray=["-server", "-Xmx1g", "-Xms1g", "-XX:MaxDirectMemorySize=3g", "-Duser.timezone=UTC", "-Dfile.encoding=UTF-8", "-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
druid_indexer_fork_property_druid_processing_buffer_sizeBytes=256MiB
druid_storage_type=local
druid_storage_storageDirectory=/opt/shared/segments
druid_indexer_logs_type=file
druid_indexer_logs_directory=/opt/shared/indexing-logs
druid_processing_numThreads=2
druid_processing_numMergeBuffers=2
DRUID_LOG4J=<?xml version="1.0" encoding="UTF-8" ?><Configuration status="WARN"><Appenders><Console name="Console" target="SYSTEM_OUT"><PatternLayout pattern="%d{ISO8601} %p [%t] %c - %m%n"/></Console></Appenders><Loggers><Root level="info"><AppenderRef ref="Console"/></Root><Logger name="org.apache.druid.jetty.RequestLog" additivity="false" level="DEBUG"><AppenderRef ref="Console"/></Logger></Loggers></Configuration>


@@ -0,0 +1,17 @@
repos:
- repo: https://github.com/psf/black
rev: 22.6.0
hooks:
- id: black
args:
- --line-length=120
- repo: https://github.com/PyCQA/isort
rev: 5.10.1
hooks:
- id: isort
args: ["--profile", "black"]
- repo: https://github.com/PyCQA/flake8
rev: 4.0.1
hooks:
- id: flake8
args: [--max-line-length=120]


@@ -17,7 +17,7 @@
},
"Key": "key.pem",
"Certificate": "cert.pem",
"RedisSocket": "/var/run/socks/redis.sock",
"RedisSocket": "/var/run/redis/redis.sock",
"RedisDBEphemeral": 1,
"RedisDBPersistent": 0,
"UsePassword": false,

legacy/docker-compose.yml Normal file

@@ -0,0 +1,41 @@
version: "2"
services:
app:
image: pathogen/threshold:latest
build: ./docker
volumes:
- ${PORTAINER_GIT_DIR}:/code
- ${THRESHOLD_CONFIG_DIR}:/code/conf/live
#- ${THRESHOLD_TEMPLATE_DIR}:/code/conf/templates
- ${THRESHOLD_CERT_DIR}:/code/conf/cert
ports:
- "${THRESHOLD_LISTENER_PORT}:${THRESHOLD_LISTENER_PORT}"
- "${THRESHOLD_RELAY_PORT}:${THRESHOLD_RELAY_PORT}"
- "${THRESHOLD_API_PORT}:${THRESHOLD_API_PORT}"
env_file:
- .env
# for development
extra_hosts:
- "host.docker.internal:host-gateway"
volumes_from:
- tmp
tmp:
image: busybox
command: chmod -R 777 /var/run/redis
volumes:
- /var/run/redis
redis:
image: redis
command: redis-server /etc/redis.conf
volumes:
- ${PORTAINER_GIT_DIR}/docker/redis.conf:/etc/redis.conf
volumes_from:
- tmp
networks:
default:
external:
name: pathogen


@@ -0,0 +1,38 @@
version: "2"
services:
app:
image: pathogen/threshold:latest
build: ./docker
volumes:
- ${PORTAINER_GIT_DIR}:/code
- ${THRESHOLD_CONFIG_DIR}:/code/conf/live
#- ${THRESHOLD_TEMPLATE_DIR}:/code/conf/templates
- ${THRESHOLD_CERT_DIR}:/code/conf/cert
ports:
- "${THRESHOLD_LISTENER_PORT}:${THRESHOLD_LISTENER_PORT}"
- "${THRESHOLD_RELAY_PORT}:${THRESHOLD_RELAY_PORT}"
- "${THRESHOLD_API_PORT}:${THRESHOLD_API_PORT}"
env_file:
- ../stack.env
volumes_from:
- tmp
tmp:
image: busybox
command: chmod -R 777 /var/run/redis
volumes:
- /var/run/redis
redis:
image: redis
command: redis-server /etc/redis.conf
volumes:
- ${PORTAINER_GIT_DIR}/docker/redis.conf:/etc/redis.conf
volumes_from:
- tmp
networks:
default:
external:
name: pathogen


@@ -4,7 +4,6 @@ from os import urandom
from os.path import exists
from string import digits
import redis
from redis import StrictRedis
# List of errors ZNC can give us
@@ -122,7 +121,7 @@ def initConf():
def initMain():
global r, g, x
global r, g
initConf()
r = StrictRedis(
unix_socket_path=config["RedisSocket"], db=config["RedisDBEphemeral"] # noqa
@@ -130,5 +129,3 @@ def initMain():
g = StrictRedis(
unix_socket_path=config["RedisSocket"], db=config["RedisDBPersistent"]
) # noqa
# SSDB for communication with Monolith
x = redis.from_url("redis://ssdb:1289", db=0)


@@ -67,7 +67,7 @@ def parsemeta(numName, c):
def queue_message(c):
message = json.dumps(c)
main.x.lpush("queue", message)
main.g.sadd("queue", message)
def event(

legacy/requirements.txt Normal file

@@ -0,0 +1,9 @@
wheel
pre-commit
twisted
pyOpenSSL
redis
pyYaML
service_identity
siphashc
Klein


@@ -98,6 +98,7 @@ class IRCRelayFactory(ReconnectingClientFactory):
self.relayCommands, self.user, self.stage2 = relayCommands, user, stage2
def buildProtocol(self, addr):
entry = IRCRelay(self.num, self.relayCommands, self.user, self.stage2)
self.client = entry


@@ -1,10 +1,8 @@
import asyncio
from os import getenv
from time import sleep
import uvloop
import db
import util
from sources.ch4 import Chan4
from sources.dis import DiscordClient
@@ -23,28 +21,14 @@ if not token:
async def main(loop):
if "ingest" in modules_enabled:
ingest = Ingest()
loop.create_task(ingest.run())
if "dis" in modules_enabled:
client = DiscordClient()
loop.create_task(client.start(token))
if "ch4" in modules_enabled:
chan = Chan4()
loop.create_task(chan.run())
created = False
while not created:
try:
db.create_index(db.api_client)
created = True
except Exception as e:
print(f"Error creating index: {e}")
sleep(1) # Block the thread, just wait for the DB
db.update_schema()
ingest = Ingest()
loop.create_task(ingest.run())
loop = asyncio.get_event_loop()

oom

@@ -1,134 +0,0 @@
import asyncio
import time
import psutil
import util
class DynamicThrottle(object):
def __init__(self, **kwargs):
self.target_cpu_usage = kwargs.get("target_cpu_usage", 50)
self.sleep_interval = 0.0
self.sleep_increment = kwargs.get("sleep_increment", 0.01)
self.sleep_decrement = kwargs.get("sleep_decrement", 0.01)
self.sleep_max = kwargs.get("sleep_max", 0.1)
self.sleep_min = kwargs.get("sleep_min", 0.01)
self.psutil_interval = kwargs.get("psutil_interval", 0.1)
self.log = kwargs.get("log", util.get_logger(self.__class__.__name__))
self.consecutive_increments = 0
self.consecutive_decrements = 0
self.consecutive_divisor = kwargs.get("consecutive_divisor", 1)
self.last_was_increment = kwargs.get("start_increment", True)
if kwargs.get("use_async"):
self.wait = self.dynamic_throttle_async
else:
self.wait = self.dynamic_throttle
async def dynamic_throttle_async(self):
"""
Dynamically sleeps before a request if CPU usage is above our target.
"""
current_cpu_usage = psutil.cpu_percent(interval=self.psutil_interval)
if current_cpu_usage > self.target_cpu_usage:
if self.last_was_increment:
self.consecutive_increments += 1
# self.log.debug(f"High CPU consecutive increments: {self.consecutive_increments}")
else:
self.consecutive_increments = 0 # ?
self.consecutive_decrements = 0 # ?
# self.log.debug(f"High CPU alert reset.")
self.sleep_interval += self.sleep_increment * (
max(1, self.consecutive_increments) / self.consecutive_divisor
)
self.last_was_increment = True
if self.sleep_interval > self.sleep_max:
self.sleep_interval = self.sleep_max
# self.log.debug(f"High CPU, but not increasing above {self.sleep_max:.3f}s")
# self.log.debug(
# f"High CPU: {current_cpu_usage}% > {self.target_cpu_usage}%, "
# f"=> sleep {self.sleep_interval:.3f}s"
# )
elif current_cpu_usage < self.target_cpu_usage:
if not self.last_was_increment:
self.consecutive_decrements += 1
# self.log.debug(f"Low CPU consecutive decrements: {self.consecutive_decrements}")
else:
self.consecutive_decrements = 0 # ?
self.consecutive_increments = 0 # ?
# self.log.debug(f"Low CPU alert reset.")
self.sleep_interval -= self.sleep_decrement * (
max(1, self.consecutive_decrements) / self.consecutive_divisor
)
self.last_was_increment = False
if self.sleep_interval < self.sleep_min:
self.sleep_interval = self.sleep_min
# self.log.debug(f"Low CPU, but not decreasing below {self.sleep_min:.3f}s")
# self.log.debug(
# f"Low CPU: {current_cpu_usage}% < {self.target_cpu_usage}%, "
# f"=> sleep {self.sleep_interval:.3f}s"
# )
if self.sleep_interval > 0:
await asyncio.sleep(self.sleep_interval)
return self.sleep_interval
return 0.0
def dynamic_throttle(self):
"""
Dynamically sleeps before a request if CPU usage is above our target.
"""
current_cpu_usage = psutil.cpu_percent(interval=self.psutil_interval)
if current_cpu_usage > self.target_cpu_usage:
if self.last_was_increment:
self.consecutive_increments += 1
# self.log.debug(f"High CPU consecutive increments: {self.consecutive_increments}")
else:
self.consecutive_increments = 0 # ?
self.consecutive_decrements = 0 # ?
# self.log.debug(f"High CPU alert reset.")
self.sleep_interval += self.sleep_increment * (
max(1, self.consecutive_increments) / self.consecutive_divisor
)
self.last_was_increment = True
if self.sleep_interval > self.sleep_max:
self.sleep_interval = self.sleep_max
# self.log.debug(f"High CPU, but not increasing above {self.sleep_max:.3f}s")
# self.log.debug(
# f"High CPU: {current_cpu_usage}% > {self.target_cpu_usage}%, "
# f"=> sleep {self.sleep_interval:.3f}s"
# )
elif current_cpu_usage < self.target_cpu_usage:
if not self.last_was_increment:
self.consecutive_decrements += 1
# self.log.debug(f"Low CPU consecutive decrements: {self.consecutive_decrements}")
else:
self.consecutive_decrements = 0 # ?
self.consecutive_increments = 0 # ?
# self.log.debug(f"Low CPU alert reset.")
self.sleep_interval -= self.sleep_decrement * (
max(1, self.consecutive_decrements) / self.consecutive_divisor
)
self.last_was_increment = False
if self.sleep_interval < self.sleep_min:
self.sleep_interval = self.sleep_min
# self.log.debug(f"Low CPU, but not decreasing below {self.sleep_min:.3f}s")
# self.log.debug(
# f"Low CPU: {current_cpu_usage}% < {self.target_cpu_usage}%, "
# f"=> sleep {self.sleep_interval:.3f}s"
# )
if self.sleep_interval > 0:
time.sleep(self.sleep_interval)
return self.sleep_interval
return 0.0

View File

@@ -1 +0,0 @@
# Resample 1Min into 5Min, 15Min, 30Min, 1H, 4H, 1D, 1W, 1M, 1Y
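
The deleted file held only the note above. A hedged pandas sketch of what it describes, with assumed o/h/l/c/v column names and a DatetimeIndex on the 1-minute bars:

import pandas as pd

def resample_ohlcv(df: pd.DataFrame, rule: str) -> pd.DataFrame:
    # df: 1-minute bars indexed by a DatetimeIndex, columns o/h/l/c/v
    return df.resample(rule).agg(
        {"o": "first", "h": "max", "l": "min", "c": "last", "v": "sum"}
    ).dropna()

# e.g. for each window in the note:
# for rule in ("5Min", "15Min", "30Min", "1H", "4H", "1D", "1W", "1M", "1Y"):
#     resample_ohlcv(minute_bars, rule)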

View File

@@ -14,7 +14,7 @@ from concurrent.futures import ProcessPoolExecutor
# For timestamp processing
from datetime import datetime
from os import getenv
from math import ceil
import orjson
import regex
@@ -34,6 +34,7 @@ from gensim.parsing.preprocessing import ( # stem_text,
strip_short,
strip_tags,
)
from numpy import array_split
from polyglot.detect.base import logger as polyglot_logger
# For NLP
@@ -47,21 +48,9 @@ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import db
import util
# For throttling
from perf.throttle import DynamicThrottle
# 4chan schema
from schemas.ch4_s import ATTRMAP
trues = ("true", "1", "t", True)
KEYNAME = "queue"
MONOLITH_PROCESS_PERFSTATS = (
getenv("MONOLITH_PROCESS_PERFSTATS", "false").lower() in trues
)
TARGET_CPU_USAGE = float(os.getenv("MONOLITH_PROCESS_TARGET_CPU_USAGE", 50.0))
CUSTOM_FILTERS = [
lambda x: x.lower(),
strip_tags, #
@@ -92,19 +81,6 @@ CPU_THREADS = int(os.getenv("MONOLITH_PROCESS_THREADS", os.cpu_count()))
p = ProcessPoolExecutor(CPU_THREADS)
throttle = DynamicThrottle(
target_cpu_usage=TARGET_CPU_USAGE,
sleep_increment=0.02,
sleep_decrement=0.01,
sleep_max=0.5,
sleep_min=0,
psutil_interval=0.1,
consecutive_divisor=2,
log=log,
start_increment=True,
use_async=False,
)
def get_hash_key():
hash_key = db.r.get("hashing_key")
@@ -123,44 +99,38 @@ hash_key = get_hash_key()
@asyncio.coroutine
async def spawn_processing_threads(chunk, length):
log.debug(f"Spawning processing threads for chunk {chunk} of length {length}")
async def spawn_processing_threads(data):
len_data = len(data)
loop = asyncio.get_event_loop()
tasks = []
if length < CPU_THREADS * 100:
cores = 1
chunk_size = length
if len(data) < CPU_THREADS * 100:
split_data = [data]
else:
cores = CPU_THREADS
chunk_size = int(length / cores)
for index in range(cores):
log.debug(
f"[{chunk}/{index}] Delegating {chunk_size} messages to thread {index}"
)
task = loop.run_in_executor(p, process_data, chunk, index, chunk_size)
msg_per_core = int(len(data) / CPU_THREADS)
split_data = array_split(data, ceil(len(data) / msg_per_core))
for index, split in enumerate(split_data):
log.debug(f"Delegating processing of {len(split)} messages to thread {index}")
task = loop.run_in_executor(p, process_data, split)
tasks.append(task)
results = [await task for task in tasks]
log.debug(
(
f"Results from processing of {len_data} messages in "
f"{len(split_data)} threads: {len(results)}"
)
)
# Join the results back from the split list
flat_list = [item for sublist in results for item in sublist]
total_messages = len(flat_list)
log.info(
(
f"[{chunk}/{index}] Results from processing of {length} messages in "
f"{cores} threads: {len(flat_list)}"
)
)
await db.store_batch(flat_list)
return total_messages
await db.store_kafka_batch(flat_list)
# log.debug(f"Finished processing {len_data} messages")
def process_data(chunk, index, chunk_size):
log.debug(f"[{chunk}/{index}] Processing {chunk_size} messages")
def process_data(data):
to_store = []
sentiment_time = 0.0
@@ -169,38 +139,15 @@ def process_data(chunk, index, chunk_size):
date_time = 0.0
nlp_time = 0.0
normalise_time = 0.0
hash_time = 0.0
normal2_time = 0.0
soup_time = 0.0
sleep_time = 0.0
total_time = 0.0
# Initialise sentiment analyser
analyzer = SentimentIntensityAnalyzer()
for msg_index in range(chunk_size):
# Print percentage of msg_index relative to chunk_size
if msg_index % 10 == 0:
percentage_done = (msg_index / chunk_size) * 100
log.debug(
f"[{chunk}/{index}] {percentage_done:.2f}% done ({msg_index}/{chunk_size})"
)
msg = db.r.rpop(KEYNAME)
if not msg:
return
msg = orjson.loads(msg)
if msg["src"] == "4ch":
board = msg["net"]
thread = msg["channel"]
redis_key = (
f"cache.{board}.{thread}.{msg['no']}.{msg['resto']}.{msg['now']}"
)
key_content = db.r.get(redis_key)
if key_content is not None:
continue
db.r.set(redis_key, "1")
for msg in data:
total_start = time.process_time()
# normalise fields
start = time.process_time()
@@ -226,6 +173,24 @@ def process_data(chunk, index, chunk_size):
board = msg["net"]
thread = msg["channel"]
# Calculate hash for post
start = time.process_time()
post_normalised = orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)
hash = siphash(hash_key, post_normalised)
hash = str(hash)
redis_key = f"cache.{board}.{thread}.{msg['no']}"
key_content = db.r.get(redis_key)
if key_content:
key_content = key_content.decode("ascii")
if key_content == hash:
# This deletes the message since the append at the end won't be hit
continue
else:
msg["type"] = "update"
db.r.set(redis_key, hash)
time_took = (time.process_time() - start) * 1000
hash_time += time_took
start = time.process_time()
for key2, value in list(msg.items()):
if key2 in ATTRMAP:
@@ -243,10 +208,9 @@ def process_data(chunk, index, chunk_size):
old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M:%S")
else:
old_ts = datetime.strptime(old_time, "%m/%d/%y(%a)%H:%M")
# iso_ts = old_ts.isoformat()
# new_ts = old_ts.isoformat()
new_ts = int(old_ts.timestamp())
msg["ts"] = new_ts
# msg["iso"] = iso_ts
else:
raise Exception("No TS in msg")
time_took = (time.process_time() - start) * 1000
@@ -272,7 +236,7 @@ def process_data(chunk, index, chunk_size):
msg["lang_code"] = lang_code
msg["lang_name"] = lang_name
except cld2_error as e:
log.error(f"[{chunk}/{index}] Error detecting language: {e}")
log.error(f"Error detecting language: {e}")
# So below block doesn't fail
lang_code = None
time_took = (time.process_time() - start) * 1000
@@ -291,7 +255,7 @@ def process_data(chunk, index, chunk_size):
# Tokens
start = time.process_time()
tokens = preprocess_string(msg["msg"], CUSTOM_FILTERS)
msg["tokens"] = str(tokens)
msg["tokens"] = tokens
# n = nlp(msg["msg"])
# for tag in TAGS:
# tag_name = tag.lower()
@@ -303,25 +267,17 @@ def process_data(chunk, index, chunk_size):
# Add the mutated message to the return buffer
to_store.append(msg)
total_time += (time.process_time() - total_start) * 1000
# Dynamic throttling to reduce CPU usage
if msg_index % 5 == 0:
sleep_time += throttle.wait()
if MONOLITH_PROCESS_PERFSTATS:
log.info("=====================================")
log.info(f"Chunk: {chunk}")
log.info(f"Index: {index}")
log.info(f"Sentiment: {sentiment_time}")
log.info(f"Regex: {regex_time}")
log.info(f"Polyglot: {polyglot_time}")
log.info(f"Date: {date_time}")
log.info(f"NLP: {nlp_time}")
log.info(f"Normalise: {normalise_time}")
log.info(f"Normal2: {normal2_time}")
log.info(f"Soup: {soup_time}")
log.info(f"Total: {total_time}")
log.info(f"Throttling: {sleep_time}")
log.info("=====================================")
log.debug("=====================================")
log.debug(f"Sentiment: {sentiment_time}")
log.debug(f"Regex: {regex_time}")
log.debug(f"Polyglot: {polyglot_time}")
log.debug(f"Date: {date_time}")
log.debug(f"NLP: {nlp_time}")
log.debug(f"Normalise: {normalise_time}")
log.debug(f"Hash: {hash_time}")
log.debug(f"Normal2: {normal2_time}")
log.debug(f"Soup: {soup_time}")
log.debug(f"Total: {total_time}")
log.debug("=====================================")
return to_store
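
The block added above replaces the old "seen once" cache with content hashing: a key-sorted serialisation of the post is hashed with siphash, and the hash cached per (board, thread, post number) decides whether the post is skipped, stored, or re-emitted as an update. A hedged standalone sketch of that check; the connection and the literal hash key are assumptions (the repo reads hashing_key from Redis):

import orjson
from redis import StrictRedis
from siphashc import siphash

r = StrictRedis(unix_socket_path="/var/run/redis/redis.sock", db=0)
HASH_KEY = "0123456789ABCDEF"  # placeholder 16-character key

def should_store(msg: dict) -> bool:
    digest = str(siphash(HASH_KEY, orjson.dumps(msg, option=orjson.OPT_SORT_KEYS)))
    cache_key = f"cache.{msg['net']}.{msg['channel']}.{msg['no']}"
    previous = r.get(cache_key)
    if previous is not None:
        if previous.decode("ascii") == digest:
            return False        # unchanged post we have already stored
        msg["type"] = "update"  # same post number, edited content
    r.set(cache_key, digest)
    return True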

View File

@@ -1,13 +1,14 @@
wheel
pre-commit
beautifulsoup4
redis
siphashc
aiohttp[speedups]
python-dotenv
manticoresearch
#manticoresearch
numpy
aioredis[hiredis]
#aiokafka
aiokafka
vaderSentiment
polyglot
pyicu
@@ -20,10 +21,4 @@ gensim
python-Levenshtein
orjson
uvloop
elasticsearch[async]
msgpack
# flpc
psutil
pymexc
websockets
aiomysql
numba

186
rts.py
View File

@@ -1,186 +0,0 @@
import asyncio
import logging
from os import getenv
import orjson
import websockets
import db
# Logger setup
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("RTS")
# Environment variables
MONOLITH_RTS_MEXC_API_ACCESS_KEY = getenv("MONOLITH_RTS_MEXC_API_ACCESS_KEY", None)
MONOLITH_RTS_MEXC_API_SECRET_KEY = getenv("MONOLITH_RTS_MEXC_API_SECRET_KEY", None)
# WebSocket endpoint
MEXC_WS_URL = "wss://wbs.mexc.com/ws"
{
"d": {
"e": "spot@public.kline.v3.api",
"k": {
"t": 1737901140, # TS
"o": "684.4", # Open
"c": "684.5", # Close
"h": "684.5", # High
"l": "684.4", # Low
"v": "0.173", # Volume of the base
"a": "118.41", # Volume of the quote (Quantity)
"T": 1737901200, # ?
"i": "Min1", # ?
},
},
"c": "spot@public.kline.v3.api@BNBUSDT@Min1", # Channel
"t": 1737901159239,
"s": "BNBUSDT", # Symbol
}
# Scan DB for last endtime (T)
# Request Kline data from last endtime (T) to now
# Check Server Time
# Response
# {
# "serverTime" : 1645539742000
# }
# GET /api/v3/time
# Weight(IP): 1
# Parameter:
# NONE
# Kline/Candlestick Data
# Response
# [
# [
# 1640804880000,
# "47482.36",
# "47482.36",
# "47416.57",
# "47436.1",
# "3.550717",
# 1640804940000,
# "168387.3"
# ]
# ]
# GET /api/v3/klines
# Weight(IP): 1
# Kline/candlestick bars for a symbol. Klines are uniquely identified by their open time.
# Parameters:
# Name Type Mandatory Description
# symbol string YES
# interval ENUM YES ENUM: Kline Interval
# startTime long NO
# endTime long NO
# limit integer NO Default 500; max 1000.
# Scrub function:
# For each record, ensure there are no time gaps
# When the 1m window rolls over, the next bar's t is always equal to the previous bar's T.
# Check for gaps, and request all klines between those gaps to ensure a full DB, even with restarts.
# Idle jitter function - compare our time with server time.
# Compare ts with our time and print jitter. Add jitter warning to log and OHLC.
# High jitter may prevent us from getting the correct data for trading.
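
A hedged sketch of the scrub idea in the notes above: walk stored 1-minute bars and, wherever the next bar's open time does not equal the previous bar's close time, backfill the span from GET /api/v3/klines. The REST base URL and helper names are assumptions; only the endpoint shape comes from the notes:

import aiohttp

MEXC_REST_URL = "https://api.mexc.com"  # assumption; the WS host above is wbs.mexc.com

async def fetch_klines(session, symbol, start_ms, end_ms, interval="1m"):
    params = {
        "symbol": symbol,
        "interval": interval,
        "startTime": str(start_ms),
        "endTime": str(end_ms),
        "limit": "1000",
    }
    async with session.get(f"{MEXC_REST_URL}/api/v3/klines", params=params) as resp:
        return await resp.json()

async def scrub(session, symbol, stored_bars):
    # stored_bars: [(t, T), ...] open/close timestamps in seconds, sorted by t
    filled = []
    for (t, T), (next_t, _) in zip(stored_bars, stored_bars[1:]):
        if next_t != T:  # gap: the next open time should equal the last close time
            filled.extend(await fetch_klines(session, symbol, T * 1000, next_t * 1000))
    return filled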
async def mex_handle(data):
message = orjson.loads(data)
# print(orjson.dumps(message, option=orjson.OPT_INDENT_2).decode("utf-8"))
if "code" in message:
if message["code"] == 0:
log.info("Control message received")
return
symbol = message["s"]
open = message["d"]["k"]["o"]
close = message["d"]["k"]["c"]
high = message["d"]["k"]["h"]
low = message["d"]["k"]["l"]
volume_base = message["d"]["k"]["v"] # ERROR IN API DOCS
volume_quote = message["d"]["k"]["a"] # > a bigDecimal volume
interval = message["d"]["k"]["i"]
start_time = message["d"]["k"]["t"] # > t long stratTime
end_time = message["d"]["k"]["T"] # > T long endTime
event_time = message["t"] # t long eventTime
index = f"mex_ohlc_{symbol.lower()}"
reformatted = {
"s": symbol,
"o": float(open),
"c": float(close),
"h": float(high),
"l": float(low),
"v": float(volume_base),
"a": float(volume_quote),
"i": interval,
"t": int(start_time),
"t2": int(end_time),
"ts": int(event_time),
}
await db.rts_store_message(index, reformatted)
print(index)
print(orjson.dumps(reformatted, option=orjson.OPT_INDENT_2).decode("utf-8"))
print()
# Kline WebSocket handler
async def mex_main():
await db.init_mysql_pool()
async with websockets.connect(MEXC_WS_URL) as websocket:
log.info("WebSocket connected")
# Define symbols and intervals
symbols = ["BTCUSDT"] # Add more symbols as needed
interval = "Min1" # Kline interval
# Prepare subscription requests for Kline streams
# Request: spot@public.kline.v3.api@<symbol>@<interval>
subscriptions = [
f"spot@public.kline.v3.api@{symbol}@{interval}" for symbol in symbols
]
# Send subscription requests
subscribe_request = {
"method": "SUBSCRIPTION",
"params": subscriptions,
# "id": 1,
}
await websocket.send(orjson.dumps(subscribe_request).decode("utf-8"))
log.info(f"Subscribed to: {subscriptions}")
# Listen for messages
while True:
try:
message = await websocket.recv()
await mex_handle(message)
except websockets.exceptions.ConnectionClosed as e:
log.error(f"WebSocket connection closed: {e}")
break
# Entry point
if __name__ == "__main__":
try:
asyncio.run(mex_main())
except KeyboardInterrupt:
log.info("RTS process terminated.")

View File

@@ -129,19 +129,8 @@ schema_main = {
"version_sentiment": "int",
# 1, 2
"version_tokens": "int",
# en, ru
"lang_code": "string indexed attribute",
"lang_name": "text",
"match_ts": "timestamp",
"batch_id": "bigint",
"rule_id": "bigint",
"index": "string indexed attribute",
"meta": "text",
# "iso": "string indexed attribute",
}
schema_rule_storage = schema_main
schema_meta = {
"id": "bigint",
# 393598265, #main, Rust Programmer's Club

View File

@@ -10,7 +10,6 @@ from numpy import array_split
import db
import util
from perf.throttle import DynamicThrottle
# CONFIGURATION #
@@ -26,12 +25,6 @@ CRAWL_DELAY = int(getenv("MONOLITH_CH4_CRAWL_DELAY", 5))
# Semaphore value ?
THREADS_SEMAPHORE = int(getenv("MONOLITH_CH4_THREADS_SEMAPHORE", 1000))
# Target CPU usage percentage
TARGET_CPU_USAGE = float(getenv("MONOLITH_CH4_TARGET_CPU_USAGE", 50.0))
# Boards to crawl
BOARDS = getenv("MONOLITH_CH4_BOARDS", "").split(",")
# CONFIGURATION END #
@@ -44,19 +37,6 @@ class Chan4(object):
name = self.__class__.__name__
self.log = util.get_logger(name)
self.throttle = DynamicThrottle(
target_cpu_usage=TARGET_CPU_USAGE,
sleep_increment=0.01,
sleep_decrement=0.01,
sleep_max=0.1,
sleep_min=0,
psutil_interval=0.1,
log=self.log,
start_increment=False,
use_async=True,
)
self.wait = self.throttle.wait
self.api_endpoint = "https://a.4cdn.org"
# self.boards = ["out", "g", "a", "3", "pol"] #
self.boards = []
@@ -73,14 +53,12 @@ class Chan4(object):
self.log.debug(f"Created new hash key: {self.hash_key}")
db.r.set("hashing_key", self.hash_key)
else:
self.hash_key = self.hash_key.decode("ascii")
self.log.debug(f"Decoded hash key: {self.hash_key}")
async def run(self):
if "ALL" in BOARDS:
await self.get_board_list()
else:
self.boards = BOARDS
while True:
await self.get_thread_lists(self.boards)
await asyncio.sleep(CRAWL_DELAY)
@@ -93,37 +71,29 @@ class Chan4(object):
for board in response["boards"]:
self.boards.append(board["board"])
self.log.debug(f"Got boards: {self.boards}")
# await self.dynamic_throttle()
# TODO
async def get_thread_lists(self, boards):
# self.log.debug(f"Getting thread list for {boards}")
board_urls = {board: f"{board}/threads.json" for board in boards}
board_urls = {board: f"{board}/catalog.json" for board in boards}
responses = await self.api_call(board_urls)
to_get = []
flat_map = [board for board, thread in responses]
self.log.debug(f"Got thread list for {len(responses)} boards: {flat_map}")
for board, response in responses:
self.log.debug(f"Got thread list for {flat_map}: {len(responses)}")
for mapped, response in responses:
if not response:
continue
for page in response:
for threads in page["threads"]:
no = threads["no"]
to_get.append((board, no))
# await self.dynamic_throttle()
# TODO
to_get.append((mapped, no))
if not to_get:
return
self.log.debug(f"Got {len(to_get)} threads to fetch")
split_threads = array_split(to_get, ceil(len(to_get) / THREADS_CONCURRENT))
self.log.debug(f"Split threads into {len(split_threads)} series")
for index, thr in enumerate(split_threads):
self.log.debug(f"Series {index} - getting {len(thr)} threads")
await self.get_threads_content(thr)
# await self.dynamic_throttle()
# TODO
for threads in split_threads:
await self.get_threads_content(threads)
await asyncio.sleep(THREADS_DELAY)
# await self.get_threads_content(to_get)
def take_items(self, dict_list, n):
i = 0
@@ -153,8 +123,6 @@ class Chan4(object):
continue
board, thread = mapped
all_posts[mapped] = response["posts"]
# await self.dynamic_throttle()
# TODO
if not all_posts:
return
@@ -164,16 +132,14 @@ class Chan4(object):
to_store = []
for key, post_list in posts.items():
board, thread = key
for post in post_list:
post["type"] = "msg"
for index, post in enumerate(post_list):
posts[key][index]["type"] = "msg"
post["src"] = "4ch"
post["net"] = board
post["channel"] = thread
posts[key][index]["src"] = "4ch"
posts[key][index]["net"] = board
posts[key][index]["channel"] = thread
to_store.append(post)
# await self.dynamic_throttle()
# TODO
to_store.append(posts[key][index])
if to_store:
await db.queue_message_bulk(to_store)
@@ -188,7 +154,6 @@ class Chan4(object):
async def bound_fetch(self, sem, url, session, mapped):
# Getter function with semaphore.
async with sem:
await self.wait()
try:
return await self.fetch(url, session, mapped)
except: # noqa
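
This hunk also swaps threads.json for catalog.json, which returns a list of pages each carrying full thread stubs, so one request per board yields every thread number. A stripped-down sketch of that fetch, without the semaphore and retry plumbing above:

import asyncio
import aiohttp

API = "https://a.4cdn.org"

async def thread_numbers(board: str) -> list[tuple[str, int]]:
    async with aiohttp.ClientSession() as session:
        async with session.get(f"{API}/{board}/catalog.json") as resp:
            pages = await resp.json()
    return [(board, thread["no"]) for page in pages for thread in page["threads"]]

# asyncio.run(thread_numbers("g")) -> [("g", 12345678), ...]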

View File

@@ -1,6 +1,8 @@
import asyncio
from os import getenv
import orjson
import db
import util
from processing import process
@@ -11,22 +13,13 @@ KEYNAME = "queue"
CHUNK_SIZE = int(getenv("MONOLITH_INGEST_CHUNK_SIZE", "900"))
ITER_DELAY = float(getenv("MONOLITH_INGEST_ITER_DELAY", "0.5"))
INGEST_INCREASE_BELOW = int(getenv("MONOLITH_INGEST_INCREASE_BELOW", "2500"))
INGEST_DECREASE_ABOVE = int(getenv("MONOLITH_INGEST_DECREASE_ABOVE", "10000"))
INGEST_INCREASE_BY = int(getenv("MONOLITH_INGEST_INCREASE_BY", "100"))
INGEST_DECREASE_BY = int(getenv("MONOLITH_INGEST_DECREASE_BY", "100"))
log = util.get_logger("ingest")
INGEST_MAX = int(getenv("MONOLITH_INGEST_MAX", "1000000"))
INGEST_MIN = int(getenv("MONOLITH_INGEST_MIN", "100"))
class Ingest(object):
def __init__(self):
name = self.__class__.__name__
self.log = util.get_logger(name)
self.current_chunk = 0
self.log.info(
(
"Starting ingest handler for chunk size of "
@@ -37,45 +30,17 @@ class Ingest(object):
async def run(self):
while True:
await self.get_chunk()
self.log.debug(f"Ingest chunk {self.current_chunk} complete")
self.current_chunk += 1
await asyncio.sleep(ITER_DELAY)
async def get_chunk(self):
global CHUNK_SIZE
length = await db.ar.llen(KEYNAME)
if length > CHUNK_SIZE:
length = CHUNK_SIZE
if not length:
items = []
# for source in SOURCES:
# key = f"{KEYPREFIX}{source}"
chunk = await db.ar.spop(KEYNAME, CHUNK_SIZE)
if not chunk:
return
ingested = await process.spawn_processing_threads(self.current_chunk, length)
if ingested < INGEST_INCREASE_BELOW:
if CHUNK_SIZE + INGEST_INCREASE_BY < INGEST_MAX:
self.log.debug(
(
f"Increasing chunk size to "
f"{CHUNK_SIZE + INGEST_INCREASE_BY} "
f"due to low ingestion ({ingested})"
)
)
CHUNK_SIZE += INGEST_INCREASE_BY
else:
log.debug(
f"Chunk size ({CHUNK_SIZE}) at maximum, not increasing above: {INGEST_MAX}"
)
elif ingested > INGEST_DECREASE_ABOVE:
if CHUNK_SIZE - INGEST_DECREASE_BY > INGEST_MIN:
self.log.debug(
(
f"Decreasing chunk size to "
f"{CHUNK_SIZE - INGEST_DECREASE_BY}"
f"due to high ingestion ({ingested})"
)
)
CHUNK_SIZE -= INGEST_DECREASE_BY
else:
log.debug(
f"Chunk size ({CHUNK_SIZE}) at minimum, not decreasing below: {INGEST_MIN}"
)
for item in chunk:
item = orjson.loads(item)
items.append(item)
if items:
await process.spawn_processing_threads(items)
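
After this change the ingest loop is a plain chunked consumer: SPOP up to CHUNK_SIZE members of the queue set, decode them with orjson, and hand the batch to spawn_processing_threads. A hedged standalone sketch using redis.asyncio (the repo's db module wraps aioredis, so the connection line is an assumption):

import asyncio
import orjson
import redis.asyncio as redis

KEYNAME = "queue"
CHUNK_SIZE = 900

async def consume(process_batch):
    r = redis.from_url("unix:///var/run/redis/redis.sock")
    while True:
        chunk = await r.spop(KEYNAME, CHUNK_SIZE)
        if not chunk:
            await asyncio.sleep(0.5)  # ITER_DELAY equivalent
            continue
        items = [orjson.loads(item) for item in chunk]
        await process_batch(items)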

View File

@@ -43,6 +43,7 @@ class ColoredFormatter(logging.Formatter):
def get_logger(name):
# Define the logging format
FORMAT = "%(asctime)s %(levelname)18s $BOLD%(name)13s$RESET - %(message)s"
COLOR_FORMAT = formatter_message(FORMAT, True)