Training NLP Model for Twitter/X Crypto Analysis
Twitter/X is the fastest channel for distributing crypto information. Influencers with millions of followers, anonymous analysts, and project employees all communicate here. A model capable of analyzing this stream in real time catches signals before they are reflected in the price.
Twitter API v2 for Crypto Data
import tweepy
from datetime import datetime, timedelta
class TwitterCryptoCollector:
    """Collect crypto-related tweets through Tweepy (Twitter API v2).

    ``search_recent`` runs a one-off recent-search query and
    ``get_crypto_stream`` opens a filtered stream for realtime monitoring.
    """

    # Filters appended to every search query: English only, no retweets.
    _QUERY_FILTERS = 'lang:en -is:retweet'
    # Range of ``max_results`` accepted by the recent-search endpoint.
    _MIN_RESULTS = 10
    _MAX_RESULTS = 100

    def __init__(self, bearer_token):
        """Create the API client.

        Args:
            bearer_token: Twitter API v2 bearer token. It is kept on the
                instance because the streaming client needs it as well.
        """
        self._bearer_token = bearer_token
        self.client = tweepy.Client(bearer_token=bearer_token, wait_on_rate_limit=True)

    def search_recent(self, query, max_results=100):
        """Search recent English, non-retweet tweets matching ``query``.

        Quota notes kept from the original author (verify against the
        current API pricing tiers before relying on them):
        Twitter API v2 Basic: up to 500k tweets/month;
        Academic: up to 10M tweets/month + historical access.

        Args:
            query: Search query in Twitter query syntax.
            max_results: Requested page size; clamped to the 10..100 range.

        Returns:
            Whatever ``tweepy.Client.search_recent_tweets`` returns.
        """
        # The query is parenthesised because AND binds tighter than OR in
        # Twitter query syntax: without the parentheses the language/retweet
        # filters would apply only to the last OR-ed term of the query.
        full_query = f'({query}) {self._QUERY_FILTERS}'
        page_size = max(self._MIN_RESULTS, min(max_results, self._MAX_RESULTS))
        tweets = self.client.search_recent_tweets(
            query=full_query,
            max_results=page_size,
            tweet_fields=['created_at', 'public_metrics', 'author_id'],
            expansions=['author_id'],
            user_fields=['public_metrics', 'verified'],
        )
        return tweets

    def get_crypto_stream(self, keywords, stream_cls=None):
        """Open a filtered stream for realtime monitoring.

        Args:
            keywords: Iterable of rule strings; one stream rule is
                registered per keyword.
            stream_cls: Optional ``tweepy.StreamingClient`` subclass whose
                ``on_tweet`` handles incoming tweets. Defaults to
                ``tweepy.StreamingClient`` itself.

        Returns:
            The streaming client, so the caller can disconnect it later.
        """
        # Stream rules and filtering belong to StreamingClient, not to the
        # REST ``tweepy.Client`` held in ``self.client``.
        if stream_cls is None:
            stream_cls = tweepy.StreamingClient
        stream = stream_cls(self._bearer_token, wait_on_rate_limit=True)
        rules = [tweepy.StreamRule(kw) for kw in keywords]
        if rules:
            stream.add_rules(rules)
        # Launch stream in separate thread
        stream.filter(tweet_fields=['public_metrics', 'created_at'], threaded=True)
        return stream
# Ready-made search queries keyed by a short label. The BTC and ETH entries
# combine a parenthesised OR-group of names/hashtags with the word "crypto";
# GENERAL is a plain OR of three hashtags.
CRYPTO_QUERIES = dict(
    BTC='(bitcoin OR btc OR #BTC OR #Bitcoin) crypto',
    ETH='(ethereum OR eth OR #ETH OR #Ethereum) crypto',
    GENERAL='#crypto OR #cryptocurrency OR #altcoins',
)
Twitter NLP Specifics
Tweets are short (up to 280 characters) and full of slang, cashtags ($BTC), emoji, and abbreviations. Standard NLP models perform poorly on them without fine-tuning.
Preprocessing:
import re
from emoji import demojize
# Crypto slang mapped to plain-English wording (keys are lowercase).
_CRYPTO_SLANG = {
    'hodl': 'hold',
    'rekt': 'ruined',
    'wen': 'when',
    'gm': 'good morning',
    'ngmi': 'not going to make it',
    'wagmi': 'we are all going to make it',
    'degen': 'degenerate speculator',
    'ape': 'invest blindly',
}
# Single whole-word, case-insensitive alternation over every slang term,
# compiled once instead of once per term on every call.
_SLANG_RE = re.compile(
    r'\b(?:' + '|'.join(re.escape(term) for term in _CRYPTO_SLANG) + r')\b',
    re.IGNORECASE,
)
# Cashtags are matched in any letter case: "$btc" is as common as "$BTC".
_CASHTAG_RE = re.compile(r'\$([A-Za-z]{2,6})')
_URL_RE = re.compile(r'http\S+')
_MENTION_RE = re.compile(r'@\w+')


def preprocess_tweet(text):
    """Normalize a raw tweet before it is tokenized.

    Steps, in order: emoji are replaced by their text description, cashtags
    become ``TOKEN_<SYMBOL>``, URLs become ``[URL]``, mentions become
    ``[USER]``, and known crypto slang is expanded to plain English.

    Args:
        text: Raw tweet text.

    Returns:
        The normalized text.
    """
    # Replace emoji with text description
    text = demojize(text)
    # Normalize cashtags. The symbol is upper-cased so "$btc" and "$BTC"
    # yield the same token. This runs before slang expansion so a ticker
    # such as "$ape" is not rewritten as slang: the underscore in
    # "TOKEN_APE" removes the word boundary the slang pattern needs.
    text = _CASHTAG_RE.sub(lambda m: 'TOKEN_' + m.group(1).upper(), text)
    # Replace URLs with a placeholder
    text = _URL_RE.sub('[URL]', text)
    # Normalize mentions
    text = _MENTION_RE.sub('[USER]', text)
    # Crypto-specific replacements. No expansion contains another slang term
    # as a whole word, so one pass equals replacing the terms one by one.
    text = _SLANG_RE.sub(lambda m: _CRYPTO_SLANG[m.group(0).lower()], text)
    return text
Tweet-BERT: Model for Crypto Tweets
BERTweet is a BERT-style language model pre-trained on 850M English tweets. It is the best baseline for Twitter NLP.
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
class CryptoTweetAnalyzer:
    """Sentiment scorer for crypto tweets built on a BERTweet classifier."""

    def __init__(self):
        """Load the tokenizer and the classifier, then switch to eval mode."""
        # The tokenizer comes from the public BERTweet checkpoint; the
        # classifier weights come from a local fine-tuned directory.
        finetuned_dir = './crypto_bertweet_finetuned'
        self.tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')
        self.model = AutoModelForSequenceClassification.from_pretrained(finetuned_dir)
        self.model.eval()

    def analyze(self, tweet_text):
        """Return class probabilities and a net sentiment score for a tweet.

        Args:
            tweet_text: Raw tweet text; it is passed through
                ``preprocess_tweet`` before tokenization.

        Returns:
            Dict with ``bullish``, ``bearish`` and ``neutral`` probabilities
            and ``score`` = bullish minus bearish.
        """
        encoded = self.tokenizer(
            preprocess_tweet(tweet_text),
            max_length=128,
            truncation=True,
            return_tensors='pt',
        )
        with torch.no_grad():
            output = self.model(**encoded)
            class_probs = torch.softmax(output.logits, dim=-1)[0]
        # NOTE(review): assumes the fine-tuned head orders its labels as
        # 0=bullish, 1=bearish, 2=neutral; confirm against the training config.
        bullish = class_probs[0].item()
        bearish = class_probs[1].item()
        neutral = class_probs[2].item()
        return {
            'bullish': bullish,
            'bearish': bearish,
            'neutral': neutral,
            'score': bullish - bearish,
        }
Influence Weighting
Not all tweets are equally important, so each tweet is weighted by its author's influence:
def calculate_author_influence(user_metrics):
    """Score an author's influence on a 0..10 scale.

    The score is ``log1p(followers) * sqrt(followers / following)``, times
    1.5 for verified accounts, divided by 20 and capped at 10.

    Args:
        user_metrics: Mapping that may contain ``followers_count``,
            ``following_count`` and ``verified``. Missing keys fall back to
            0 followers, 1 following and not verified.
            NOTE(review): in API v2 ``verified`` is a field of the user
            object rather than of ``public_metrics``; confirm the caller
            merges it into this mapping.

    Returns:
        The influence score, a number between 0 and 10.
    """
    # Local import: numpy is not among the imports visible in this file,
    # and the inputs are plain scalars, so the stdlib is sufficient.
    import math

    followers = max(user_metrics.get('followers_count') or 0, 0)
    # Accounts that follow nobody report 0; floor at 1 so the ratio below
    # cannot divide by zero.
    following = max(user_metrics.get('following_count') or 1, 1)
    # Follower/Following ratio (audience quality)
    ff_ratio = followers / following
    # Log-scaled follower count
    log_followers = math.log1p(followers)
    # Verified account bonus
    verified_bonus = 1.5 if user_metrics.get('verified', False) else 1.0
    influence_score = log_followers * math.sqrt(ff_ratio) * verified_bonus
    return min(influence_score / 20, 10)  # normalize to [0, 10]
def weighted_sentiment(tweets_with_scores):
    """Aggregate per-tweet sentiment into one weighted score.

    Each tweet is weighted by ``engagement * author_influence``. The sum of
    weighted scores is divided by the sum of weights, and the divisor is
    floored at 1. An empty input therefore yields 0, and a total weight
    below 1 leaves the weighted sum unscaled.

    Args:
        tweets_with_scores: Re-iterable collection of mappings with the keys
            ``score``, ``engagement`` and ``author_influence``.

    Returns:
        The weighted sentiment value.
    """
    def _weighted_score(entry):
        return entry['score'] * entry['engagement'] * entry['author_influence']

    def _weight(entry):
        return entry['engagement'] * entry['author_influence']

    score_sum = sum(map(_weighted_score, tweets_with_scores))
    weight_sum = sum(map(_weight, tweets_with_scores))
    if weight_sum < 1:
        weight_sum = 1
    return score_sum / weight_sum
Viral Content Detection
A fast-spreading tweet can signal a market opportunity:
def detect_viral_tweet(tweet, time_window_minutes=30, viral_threshold=500):
    """Flag a young tweet whose projected retweet count exceeds a threshold.

    The current retweet rate is extrapolated linearly to the full time
    window. Tweets older than the window are never flagged.

    Args:
        tweet: Mapping with ``public_metrics['retweet_count']`` and a
            ``created_at`` datetime. A naive ``created_at`` is interpreted
            as UTC.
        time_window_minutes: Length of the projection window in minutes.
        viral_threshold: Projected retweets above which the tweet counts as
            viral.

    Returns:
        ``(True, projected_retweets)`` when viral, otherwise ``(False, 0)``.
    """
    # Local import keeps this function independent of which datetime names
    # the file imports at the top.
    from datetime import datetime, timezone

    retweets = tweet['public_metrics']['retweet_count']
    created_at = tweet['created_at']
    # Compare aware with aware: subtracting an aware timestamp from a naive
    # "now" raises TypeError.
    if created_at.tzinfo is None:
        created_at = created_at.replace(tzinfo=timezone.utc)
    age_minutes = (datetime.now(timezone.utc) - created_at).total_seconds() / 60
    # A zero or negative age (clock skew, future timestamp) gives no usable
    # rate and would divide by zero below.
    if not 0 < age_minutes < time_window_minutes:
        return False, 0
    projected_retweets = retweets / age_minutes * time_window_minutes
    if projected_retweets > viral_threshold:
        return True, projected_retweets
    return False, 0
KOL (Key Opinion Leaders) Monitoring
A separate module monitors tweets from a list of influential accounts:
- Vitalik Buterin, CZ, SBF (historically), Elon Musk
- Analysts: PlanB, Willy Woo, IntoTheBlock team
- Protocol founders
Any KOL tweet triggers an immediate alert that includes its sentiment analysis.
Stack: Python asyncio for the Twitter Stream API, Redis for deduplication and caching, Kafka for buffering high-volume flows, PostgreSQL for storage, a GPU inference server for batch classification, and a React dashboard.
Developing a Twitter monitoring system with BERTweet fine-tuning, influence weighting, viral detection, KOL monitoring, and real-time sentiment aggregation.







